22#include "llvm/IR/BasicBlock.h"
23#include "llvm/IR/Constants.h"
24#include "llvm/IR/DerivedTypes.h"
25#include "llvm/IR/ReplaceConstant.h"
26#include "llvm/Support/Format.h"
29using namespace CodeGen;
// Magic values written as the first integer field of the fatbin wrapper
// struct built in makeModuleCtorFunction(); one of the two is selected there.
32constexpr unsigned CudaFatMagic = 0x466243b1;
// 0x48495046 is the ASCII byte sequence 'H','I','P','F'.
33constexpr unsigned HIPFatMagic = 0x48495046;
// Cached LLVM integer types used when building runtime-call signatures.
38 llvm::IntegerType *IntTy, *SizeTy;
// Cached pointer types; the constructor derives VoidPtrPtrTy from VoidPtrTy.
40 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
// LLVM context used when creating basic blocks and constant data.
43 llvm::LLVMContext &Context;
// Module that receives every global and function generated by this class.
45 llvm::Module &TheModule;
// Stub function name -> kernel handle (populated by getKernelHandle()).
55 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
// Kernel handle -> stub function (queried by getKernelStub()).
57 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
// NOTE(review): the declaration enclosing this field is not visible in this
// excerpt; it looks like the per-variable record stored in DeviceVars.
59 llvm::GlobalVariable *Var;
// Global that stores the registered GPU binary handle. It stays null until
// makeModuleCtorFunction() creates it; makeModuleDtorFunction() tests it.
67 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
// Initialized from LangOpts.GPURelocatableDeviceCode in the constructor.
69 bool RelocatableDeviceCode;
// NOTE(review): presumably the mangler used for device-side names - confirm
// against getDeviceSideName(), whose mangling calls are not visible here.
71 std::unique_ptr<MangleContext> DeviceMC;
// Index list handed to getGetElementPtr() by the makeConstant* helpers.
73 llvm::Constant *Zeros[2];
// Private helpers defined out of line further down in the file.
75 llvm::FunctionCallee getSetupArgumentFn()
const;
76 llvm::FunctionCallee getLaunchFn()
const;
78 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
79 llvm::FunctionType *getCallbackFnTy()
const;
80 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
// Return FuncName with a "hip"/"cuda" (or "__hip"/"__cuda") prefix.
81 std::string addPrefixToName(StringRef FuncName)
const;
82 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
// Builds the function that registers kernels and device variables.
85 llvm::Function *makeRegisterGlobalsFn();
90 llvm::Constant *makeConstantString(
const std::string &Str,
91 const std::string &Name =
"") {
92 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
93 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
94 ConstStr.getPointer(), Zeros);
// Emits Str as a constant data array in a new private, constant global and
// returns a constant GEP (indexed with Zeros) to that global.
// AddNull is forwarded to ConstantDataArray::getString().
// NOTE(review): `Name` is used below but its parameter declaration is not
// visible in this excerpt.
100 llvm::Constant *makeConstantArray(StringRef Str,
102 StringRef SectionName =
"",
103 unsigned Alignment = 0,
104 bool AddNull =
false) {
105 llvm::Constant *
Value =
106 llvm::ConstantDataArray::getString(Context, Str, AddNull);
// Third constructor argument (true) marks the global as constant.
107 auto *GV =
new llvm::GlobalVariable(
108 TheModule,
Value->getType(),
true,
109 llvm::GlobalValue::PrivateLinkage,
Value, Name);
// A section is only assigned when the caller supplied a non-empty name.
110 if (!SectionName.empty()) {
111 GV->setSection(SectionName);
114 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
// NOTE(review): llvm::Align requires a non-zero value; the guard that is
// presumably in front of this call is not visible here - confirm.
117 GV->setAlignment(llvm::Align(Alignment));
118 return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
// Creates an internal-linkage function named "dummy" of type FnTy whose only
// block consists of a `ret void`. FnTy must return void (asserted).
122 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
123 assert(FnTy->getReturnType()->isVoidTy() &&
124 "Can only generate dummy functions returning void!");
125 llvm::Function *DummyFunc = llvm::Function::Create(
126 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
128 llvm::BasicBlock *DummyBlock =
129 llvm::BasicBlock::Create(Context,
"", DummyFunc);
// NOTE(review): the declaration of FuncBuilder is not visible in this excerpt.
131 FuncBuilder.SetInsertPoint(DummyBlock);
132 FuncBuilder.CreateRetVoid();
// Records a device variable by appending an entry for Var to DeviceVars.
// The entry includes whether VD carries HIPManagedAttr; the remaining
// initializer fields are not visible in this excerpt.
141 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
142 bool Extern,
bool Constant) {
143 DeviceVars.push_back({&Var,
146 VD->hasAttr<HIPManagedAttr>(),
// Records a surface variable by appending an entry for Var to DeviceVars.
// NOTE(review): the rest of the initializer list is not visible here.
149 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
150 bool Extern,
int Type) {
151 DeviceVars.push_back({&Var,
// Records a texture variable by appending an entry for Var to DeviceVars;
// the visible tail of the initializer passes `false, Normalized, Type`.
157 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
158 bool Extern,
int Type,
bool Normalized) {
159 DeviceVars.push_back({&Var,
162 false, Normalized,
Type}});
// Builds the module constructor that registers the GPU binary.
166 llvm::Function *makeModuleCtorFunction();
// Builds the module destructor that unregisters the GPU binary.
168 llvm::Function *makeModuleDtorFunction();
// Creates the replacement globals for managed device variables.
170 void transformManagedVars();
// Emits offloading entries for the emitted kernels and device variables.
172 void createOffloadingEntries();
// Looks Handle up in KernelStubs; asserts that an entry exists.
// NOTE(review): the return statement is not visible in this excerpt.
178 llvm::Function *
getKernelStub(llvm::GlobalValue *Handle)
override {
179 auto Loc = KernelStubs.find(Handle);
180 assert(Loc != KernelStubs.end());
185 llvm::GlobalVariable &Var)
override;
188 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
195std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
196 if (CGM.getLangOpts().HIP)
197 return ((Twine(
"hip") + Twine(FuncName)).str());
198 return ((Twine(
"cuda") + Twine(FuncName)).str());
201CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
202 if (CGM.getLangOpts().HIP)
203 return ((Twine(
"__hip") + Twine(FuncName)).str());
204 return ((Twine(
"__cuda") + Twine(FuncName)).str());
214 return std::unique_ptr<MangleContext>(
225 TheModule(CGM.getModule()),
226 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
234 Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
237 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.
CharTy));
238 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.
VoidPtrTy));
239 VoidPtrPtrTy = VoidPtrTy->getPointerTo();
// Builds the callee for the argument-setup runtime function: a non-variadic
// function returning IntTy that takes (VoidPtrTy, SizeTy, SizeTy), named
// "SetupArgument" with the hip/cuda prefix from addPrefixToName().
// NOTE(review): the call that wraps the type and name is not visible here.
242llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
244 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
246 llvm::FunctionType::get(IntTy, Params,
false),
247 addPrefixToName(
"SetupArgument"));
// Builds the callee for the kernel-launch runtime function. Both visible
// variants are non-variadic, return IntTy and take one CharPtrTy argument;
// they differ only in name ("hipLaunchByPtr" vs. "cudaLaunch").
// NOTE(review): the condition selecting between them is not visible here.
250llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
254 llvm::FunctionType::get(IntTy, CharPtrTy,
false),
"hipLaunchByPtr");
258 llvm::FunctionType::get(IntTy, CharPtrTy,
false),
"cudaLaunch");
261llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
262 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy,
false);
265llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
266 return llvm::FunctionType::get(VoidTy, VoidPtrTy,
false);
269llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
270 auto *CallbackFnTy = getCallbackFnTy();
271 auto *RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
272 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
273 VoidPtrTy, CallbackFnTy->getPointerTo()};
274 return llvm::FunctionType::get(VoidTy, Params,
false);
// Computes and returns the device-side name for ND as a std::string.
277std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
// Function declarations are referenced through their kernel GlobalDecl.
280 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
281 GD =
GlobalDecl(FD, KernelReferenceKind::Kernel);
284 std::string DeviceSideName;
// NOTE(review): what is written to Out here is not visible in this excerpt.
292 llvm::raw_svector_ostream Out(Buffer);
294 DeviceSideName = std::string(Out.str());
// Second pass: re-emit the name into a stream and read it back.
// NOTE(review): a line between these two is missing, presumably appending
// something to the name - confirm.
302 llvm::raw_svector_ostream Out(Buffer);
303 Out << DeviceSideName;
305 DeviceSideName = std::string(Out.str());
307 return DeviceSideName;
314 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
315 GV->setLinkage(CGF.
CurFn->getLinkage());
316 GV->setInitializer(CGF.
CurFn);
319 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
321 emitDeviceStubBodyNew(CGF, Args);
323 emitDeviceStubBodyLegacy(CGF, Args);
337 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
339 for (
unsigned i = 0; i < Args.size(); ++i) {
341 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, VoidPtrTy);
361 std::string KernelLaunchAPI =
"LaunchKernel";
363 LangOptions::GPUDefaultStreamKind::PerThread)
364 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
365 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
369 for (
auto *Result : DC->
lookup(&cudaLaunchKernelII)) {
371 cudaLaunchKernelFD = FD;
374 if (cudaLaunchKernelFD ==
nullptr) {
376 "Can't find declaration for " + LaunchKernelName);
391 llvm::FunctionType::get(IntTy,
397 addUnderscoredPrefixToName(
"PopCallConfiguration"));
405 KernelHandles[CGF.
CurFn->getName()], VoidPtrTy);
421 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
425 llvm::FunctionCallee cudaLaunchKernelFn =
437 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
440 for (
const VarDecl *A : Args) {
443 llvm::Value *Args[] = {
446 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
447 llvm::ConstantInt::get(SizeTy,
Offset.getQuantity()),
450 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
451 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB, Zero);
453 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
459 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
460 llvm::Value *Arg = CGF.
Builder.CreatePointerCast(
461 KernelHandles[CGF.
CurFn->getName()], CharPtrTy);
471 llvm::GlobalVariable *ManagedVar) {
// Rewrites uses of Var so that instructions use a value loaded from
// ManagedVar instead. Each work item is a chain of users starting at a
// direct user of Var.
473 for (
auto &&VarUse : Var->uses()) {
474 WorkList.push_back({VarUse.getUser()});
476 while (!WorkList.empty()) {
477 auto &&WorkItem = WorkList.pop_back_val();
478 auto *
U = WorkItem.back();
// A constant-expression user is not rewritten directly; the chain is
// extended with each of its own users and queued again.
479 if (isa<llvm::ConstantExpr>(
U)) {
480 for (
auto &&UU :
U->uses()) {
481 WorkItem.push_back(UU.getUser());
482 WorkList.push_back(WorkItem);
// An instruction user ends the chain: a load from ManagedVar named
// "ld.managed" is inserted before it.
487 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
488 llvm::Value *OldV = Var;
489 llvm::Instruction *NewV =
490 new llvm::LoadInst(Var->getType(), ManagedVar,
"ld.managed",
false,
491 llvm::Align(Var->getAlignment()), I);
// Each constant expression in the chain is materialized as an instruction
// before I and has OldV replaced by NewV.
// NOTE(review): the lines that update OldV/NewV between iterations are not
// visible in this excerpt.
495 for (
auto &&Op : WorkItem) {
496 auto *CE = cast<llvm::ConstantExpr>(Op);
497 auto *NewInst = CE->getAsInstruction(I);
498 NewInst->replaceUsesOfWith(OldV, NewV);
502 I->replaceUsesOfWith(OldV, NewV);
// Any other kind of user is treated as a hard error.
504 llvm_unreachable(
"Invalid use of managed variable");
// Builds and returns an internal function named
// "__{hip,cuda}_register_globals" that issues one runtime registration call
// per emitted kernel and per device variable, surface and texture.
523llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
// Guard for the case where there is nothing to register (the statement it
// controls is not visible in this excerpt).
525 if (EmittedKernels.empty() && DeviceVars.empty())
528 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
529 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
530 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
531 llvm::BasicBlock *EntryBB =
532 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
534 Builder.SetInsertPoint(EntryBB);
// Parameter types of the "__{hip,cuda}RegisterFunction" runtime entry point,
// which returns IntTy.
538 llvm::Type *RegisterFuncParams[] = {
539 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
540 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
542 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
543 addUnderscoredPrefixToName(
"RegisterFunction"));
// The generated function's first argument is passed on to the registration
// calls below as the GPU binary handle pointer.
548 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
// One RegisterFunction call per emitted kernel.
549 for (
auto &&I : EmittedKernels) {
550 llvm::Constant *KernelName =
551 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
552 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
553 llvm::Value *Args[] = {
555 Builder.CreateBitCast(KernelHandles[I.Kernel->getName()], VoidPtrTy),
558 llvm::ConstantInt::get(IntTy, -1),
563 llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
564 Builder.CreateCall(RegisterFunc, Args);
// NOTE(review): VarSizeTy starts as IntTy; lines that may adjust it are not
// visible in this excerpt.
567 llvm::Type *VarSizeTy = IntTy;
// Runtime entry point "__{hip,cuda}RegisterVar" (returns void).
575 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
576 CharPtrTy, IntTy, VarSizeTy,
579 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
580 addUnderscoredPrefixToName(
"RegisterVar"));
// Runtime entry point "__{hip,cuda}RegisterManagedVar" (returns void).
583 llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
584 CharPtrTy, VarSizeTy, IntTy};
586 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
587 addUnderscoredPrefixToName(
"RegisterManagedVar"));
// Runtime entry point "__{hip,cuda}RegisterSurface".
591 llvm::FunctionType::get(
592 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
594 addUnderscoredPrefixToName(
"RegisterSurface"));
// Runtime entry point "__{hip,cuda}RegisterTexture".
598 llvm::FunctionType::get(
600 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
602 addUnderscoredPrefixToName(
"RegisterTexture"));
// One registration call per recorded device variable, chosen by its kind.
603 for (
auto &&Info : DeviceVars) {
604 llvm::GlobalVariable *Var = Info.Var;
605 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
606 "External variables should not show up here, except HIP managed "
608 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
609 switch (Info.Flags.getKind()) {
610 case DeviceVarFlags::Variable: {
// Managed variable: create a second global that takes over Var's name,
// while Var itself is renamed with a ".managed" suffix.
613 if (Info.Flags.isManaged()) {
614 auto *ManagedVar =
new llvm::GlobalVariable(
616 false, Var->getLinkage(),
619 : llvm::ConstantPointerNull::get(Var->getType()),
621 llvm::GlobalVariable::NotThreadLocal);
622 ManagedVar->setDSOLocal(Var->isDSOLocal());
623 ManagedVar->setVisibility(Var->getVisibility());
624 ManagedVar->setExternallyInitialized(
true);
625 ManagedVar->takeName(Var);
626 Var->setName(Twine(ManagedVar->getName() +
".managed"));
628 llvm::Value *Args[] = {
630 Builder.CreateBitCast(ManagedVar, VoidPtrTy),
631 Builder.CreateBitCast(Var, VoidPtrTy),
633 llvm::ConstantInt::get(VarSizeTy, VarSize),
634 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
// Only definitions are registered; declarations are skipped.
635 if (!Var->isDeclaration())
636 Builder.CreateCall(RegisterManagedVar, Args);
// Non-managed variable: plain RegisterVar call.
638 llvm::Value *Args[] = {
640 Builder.CreateBitCast(Var, VoidPtrTy),
643 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
644 llvm::ConstantInt::get(VarSizeTy, VarSize),
645 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
646 llvm::ConstantInt::get(IntTy, 0)};
647 Builder.CreateCall(RegisterVar, Args);
651 case DeviceVarFlags::Surface:
654 {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
655 VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
656 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
658 case DeviceVarFlags::Texture:
661 {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
662 VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
663 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
664 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
669 Builder.CreateRetVoid();
670 return RegisterKernelsFunc;
// Builds and returns the internal module constructor
// "__{hip,cuda}_module_ctor". The generated code registers the GPU binary
// with the runtime, calls the globals-registration function and hands the
// module destructor to AtExitFunc.
692llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// Guards for the cases with no GPU binary / nothing to register (the
// statements they control are not visible in this excerpt).
697 if (CudaGpuBinaryFileName.empty() && !IsHIP)
699 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
704 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
// With relocatable device code a registration function is always needed, so
// an empty dummy is substituted when none was generated.
707 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
708 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
// Runtime entry point "__{hip,cuda}RegisterFatBinary": VoidPtrPtrTy(VoidPtrTy).
712 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy,
false),
713 addUnderscoredPrefixToName(
"RegisterFatBinary"));
// Layout of the fatbin wrapper: { int, int, void*, void* }.
715 llvm::StructType *FatbinWrapperTy =
716 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
// Read the GPU binary from disk when a file name was given; a read failure
// is reported through a diagnostic that includes the file name and message.
722 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
723 if (!CudaGpuBinaryFileName.empty()) {
724 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
725 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
726 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
728 << CudaGpuBinaryFileName << EC.message();
731 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
734 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
735 llvm::FunctionType::get(VoidTy,
false),
736 llvm::GlobalValue::InternalLinkage,
737 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
738 llvm::BasicBlock *CtorEntryBB =
739 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
742 CtorBuilder.SetInsertPoint(CtorEntryBB);
// Section names and module-ID prefix differ between the HIP and CUDA paths.
744 const char *FatbinConstantName;
745 const char *FatbinSectionName;
746 const char *ModuleIDSectionName;
747 StringRef ModuleIDPrefix;
748 llvm::Constant *FatBinStr;
// HIP naming.
751 FatbinConstantName =
".hip_fatbin";
752 FatbinSectionName =
".hipFatBinSegment";
754 ModuleIDSectionName =
"__hip_module_id";
755 ModuleIDPrefix =
"__hip_";
// When the binary is available it is embedded with 4096-byte alignment.
760 const unsigned HIPCodeObjectAlign = 4096;
761 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
762 FatbinConstantName, HIPCodeObjectAlign);
// Otherwise an external constant "__hip_fatbin" is referenced instead.
768 FatBinStr =
new llvm::GlobalVariable(
770 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
771 "__hip_fatbin",
nullptr,
772 llvm::GlobalVariable::NotThreadLocal);
773 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
776 FatMagic = HIPFatMagic;
// CUDA naming; macOS targets use "__NV_CUDA,..." section names.
778 if (RelocatableDeviceCode)
779 FatbinConstantName = CGM.
getTriple().isMacOSX()
780 ?
"__NV_CUDA,__nv_relfatbin"
784 CGM.
getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
787 CGM.
getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
789 ModuleIDSectionName = CGM.
getTriple().isMacOSX()
790 ?
"__NV_CUDA,__nv_module_id"
792 ModuleIDPrefix =
"__nv_";
// The CUDA binary is embedded with 8-byte alignment.
796 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
797 FatbinConstantName, 8);
798 FatMagic = CudaFatMagic;
// Fill the wrapper struct: magic, the constant 1, pointer to the binary,
// and a null pointer.
803 auto Values = Builder.beginStruct(FatbinWrapperTy);
805 Values.addInt(IntTy, FatMagic);
807 Values.addInt(IntTy, 1);
809 Values.add(FatBinStr);
811 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
812 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
815 FatbinWrapper->setSection(FatbinSectionName);
// NOTE(review): the branch condition is not visible; judging by the
// "__hip_gpubin_handle" name this is the HIP path. The handle global is
// internal when a binary was embedded and linkonce otherwise.
825 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
826 llvm::GlobalValue::LinkOnceAnyLinkage;
827 llvm::BasicBlock *IfBlock =
828 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
829 llvm::BasicBlock *ExitBlock =
830 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
833 GpuBinaryHandle =
new llvm::GlobalVariable(
834 TheModule, VoidPtrPtrTy,
false,
836 llvm::ConstantPointerNull::get(VoidPtrPtrTy),
837 "__hip_gpubin_handle");
// A linkonce handle is placed in a comdat named after itself.
838 if (
Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
839 GpuBinaryHandle->setComdat(
840 CGM.
getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
// A non-internal handle gets hidden visibility.
843 if (
Linkage != llvm::GlobalValue::InternalLinkage)
844 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
846 GpuBinaryHandle, VoidPtrPtrTy,
// Generated code: register the fat binary only if the stored handle is null.
849 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
850 llvm::Constant *
Zero =
851 llvm::Constant::getNullValue(HandleValue->getType());
852 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
853 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
// "if" block: register and store the returned handle.
856 CtorBuilder.SetInsertPoint(IfBlock);
858 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
860 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
861 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
862 CtorBuilder.CreateBr(ExitBlock);
// "exit" block: reload the handle and register globals with it.
865 CtorBuilder.SetInsertPoint(ExitBlock);
867 if (RegisterGlobalsFunc) {
868 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
869 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
// Non-relocatable path: register unconditionally and keep the handle in an
// internal "__cuda_gpubin_handle" global.
872 }
else if (!RelocatableDeviceCode) {
876 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
878 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
879 GpuBinaryHandle =
new llvm::GlobalVariable(
880 TheModule, VoidPtrPtrTy,
false, llvm::GlobalValue::InternalLinkage,
881 llvm::ConstantPointerNull::get(VoidPtrPtrTy),
"__cuda_gpubin_handle");
883 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
887 if (RegisterGlobalsFunc)
888 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
// When the CUDA_USES_FATBIN_REGISTER_END feature applies, also call
// "__cudaRegisterFatBinaryEnd" with the handle.
892 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
895 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy,
false),
896 "__cudaRegisterFatBinaryEnd");
897 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
// Remaining path: build a module ID from the prefix and the hex GUID of the
// wrapper, emit it as a null-terminated array with alignment 32.
902 llvm::raw_svector_ostream
OS(ModuleID);
903 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
904 llvm::Constant *ModuleIDConstant = makeConstantArray(
905 std::string(ModuleID.str()),
"", ModuleIDSectionName, 32,
true);
// External alias "__fatbinwrap<ModuleID>" for the wrapper.
908 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
909 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
914 RegisterLinkedBinaryName += ModuleID;
916 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
918 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
919 llvm::Value *Args[] = {RegisterGlobalsFunc,
920 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
922 makeDummyFunction(getCallbackFnTy())};
923 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
// If a module destructor was generated, pass it to AtExitFunc, whose type is
// IntTy(pointer to the destructor).
// NOTE(review): the line naming the AtExitFunc callee is not visible here.
929 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
931 llvm::FunctionType *AtExitTy =
932 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
933 llvm::FunctionCallee AtExitFunc =
936 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
939 CtorBuilder.CreateRetVoid();
940 return ModuleCtorFunc;
// Builds and returns the internal module destructor
// "__{hip,cuda}_module_dtor", which passes the stored GPU binary handle to
// the "__{hip,cuda}UnregisterFatBinary" runtime function.
962llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// Guard for the case where no handle global was created (the statement it
// controls is not visible in this excerpt).
964 if (!GpuBinaryHandle)
969 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy,
false),
970 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
972 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
973 llvm::FunctionType::get(VoidTy,
false),
974 llvm::GlobalValue::InternalLinkage,
975 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
977 llvm::BasicBlock *DtorEntryBB =
978 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
980 DtorBuilder.SetInsertPoint(DtorEntryBB);
983 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
985 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
// Checked variant: unregister only when the handle is non-null, then store
// null back into the handle global.
// NOTE(review): the condition choosing between this variant and the
// unconditional call further down is not visible in this excerpt.
990 llvm::BasicBlock *IfBlock =
991 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
992 llvm::BasicBlock *ExitBlock =
993 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
994 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
995 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
996 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
998 DtorBuilder.SetInsertPoint(IfBlock);
999 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1000 DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
1001 DtorBuilder.CreateBr(ExitBlock);
1003 DtorBuilder.SetInsertPoint(ExitBlock);
// Unchecked variant: unregister unconditionally.
1005 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1007 DtorBuilder.CreateRetVoid();
1008 return ModuleDtorFunc;
1012 return new CGNVCUDARuntime(CGM);
// Sets Linkage to internal for variables carrying one of the CUDA device,
// constant or shared attributes.
// NOTE(review): the parameter list and further conditions of the `if` are
// not visible in this excerpt.
1015void CGNVCUDARuntime::internalizeDeviceSideVar(
1032 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>() ||
1033 D->
hasAttr<CUDASharedAttr>() ||
1036 Linkage = llvm::GlobalValue::InternalLinkage;
// Inspects D and, where applicable, records GV as a device variable,
// surface or texture via the registerDevice* helpers.
1040void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *D,
1041 llvm::GlobalVariable &GV) {
// Ordinary __device__ / __constant__ variables.
1042 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>()) {
1058 D->
hasAttr<HIPManagedAttr>()) {
1060 D->
hasAttr<CUDAConstantAttr>());
// Surface and texture types are class template specializations; their
// template arguments carry the type code (and the normalized flag).
1066 const auto *TD = cast<ClassTemplateSpecializationDecl>(
// Surface: exactly two template arguments, the second is the surface type.
1069 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1070 assert(Args.
size() == 2 &&
1071 "Unexpected number of template arguments of CUDA device "
1072 "builtin surface type.");
1073 auto SurfType = Args[1].getAsIntegral();
1075 registerDeviceSurf(D, GV, !D->
hasDefinition(), SurfType.getSExtValue());
// Texture: exactly three template arguments; the second is the texture type
// and the third the normalized flag.
1077 assert(Args.
size() == 3 &&
1078 "Unexpected number of template arguments of CUDA device "
1079 "builtin texture type.");
1080 auto TexType = Args[1].getAsIntegral();
1081 auto Normalized = Args[2].getAsIntegral();
1083 registerDeviceTex(D, GV, !D->
hasDefinition(), TexType.getSExtValue(),
1084 Normalized.getZExtValue());
// For every managed entry of kind Variable in DeviceVars, creates a second
// global that takes over the original name, while the original variable is
// renamed with a ".managed" suffix.
1093void CGNVCUDARuntime::transformManagedVars() {
1094 for (
auto &&Info : DeviceVars) {
1095 llvm::GlobalVariable *Var = Info.Var;
1096 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1097 Info.Flags.isManaged()) {
// The new global copies Var's linkage; a definition is initialized with a
// null pointer of Var's pointer type.
1098 auto *ManagedVar =
new llvm::GlobalVariable(
1100 false, Var->getLinkage(),
1101 Var->isDeclaration()
1103 : llvm::ConstantPointerNull::get(Var->getType()),
1105 llvm::GlobalVariable::NotThreadLocal,
1107 ManagedVar->setDSOLocal(Var->isDSOLocal());
1108 ManagedVar->setVisibility(Var->getVisibility());
1109 ManagedVar->setExternallyInitialized(
true);
// Swap names: ManagedVar gets the user-visible name.
1111 ManagedVar->takeName(Var);
1112 Var->setName(Twine(ManagedVar->getName()) +
".managed");
// NOTE(review): what is done for definitions beyond this assert is not
// visible in this excerpt.
1115 if (!Var->isDeclaration()) {
1116 assert(!ManagedVar->isDeclaration());
// Emits one offloading entry per emitted kernel and per recorded device
// variable through an OpenMPIRBuilder, into the section
// "hip_offloading_entries" (HIP) or "cuda_offloading_entries" (otherwise).
1127void CGNVCUDARuntime::createOffloadingEntries() {
1128 llvm::OpenMPIRBuilder OMPBuilder(CGM.
getModule());
1129 OMPBuilder.initialize();
1131 StringRef Section = CGM.
getLangOpts().HIP ?
"hip_offloading_entries"
1132 :
"cuda_offloading_entries";
// Kernels are emitted with size 0 and the OffloadGlobalEntry flag.
1133 for (KernelInfo &I : EmittedKernels)
1134 OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel->getName()],
1135 getDeviceSideName(cast<NamedDecl>(I.D)), 0,
1136 DeviceVarFlags::OffloadGlobalEntry, Section);
// Variables carry their allocation size; the flag depends on the kind.
1138 for (VarInfo &I : DeviceVars) {
1140 CGM.
getDataLayout().getTypeAllocSize(I.Var->getValueType());
1141 if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1142 OMPBuilder.emitOffloadingEntry(
1143 I.Var, getDeviceSideName(I.D), VarSize,
1144 I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
1145 : DeviceVarFlags::OffloadGlobalEntry,
1147 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1148 OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize,
1149 DeviceVarFlags::OffloadGlobalSurfaceEntry,
1151 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1152 OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize,
1153 DeviceVarFlags::OffloadGlobalTextureEntry,
// Finishes CUDA/HIP code generation for the module. The last visible
// statement returns the module constructor from makeModuleCtorFunction().
// NOTE(review): the conditions around transformManagedVars() and the
// DeviceVars loop are not visible in this excerpt.
1160llvm::Function *CGNVCUDARuntime::finalizeModule() {
1162 transformManagedVars();
// Selects defined, non-local device variables/surfaces/textures that are
// used but not marked with UsedAttr (the action taken is not visible here).
1174 for (
auto &&Info : DeviceVars) {
1175 auto Kind = Info.Flags.getKind();
1176 if (!Info.Var->isDeclaration() &&
1177 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1178 (Kind == DeviceVarFlags::Variable ||
1179 Kind == DeviceVarFlags::Surface ||
1180 Kind == DeviceVarFlags::Texture) &&
1181 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
// Offloading entries are only emitted for the new driver combined with
// relocatable device code.
1187 if (CGM.
getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1188 createOffloadingEntries();
1190 return makeModuleCtorFunction();
// Returns the kernel handle for stub function F, keeping KernelHandles
// (keyed by F's name) and KernelStubs (keyed by handle) in sync.
// NOTE(review): several branch conditions and all return statements are not
// visible in this excerpt.
1195llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1197 auto Loc = KernelHandles.find(F->getName());
// A handle already exists for this name: update the reverse mapping if it
// points at a different function.
1198 if (Loc != KernelHandles.end()) {
1199 auto OldHandle = Loc->second;
1200 if (KernelStubs[OldHandle] == F)
1208 KernelStubs[OldHandle] = F;
1213 KernelStubs.erase(OldHandle);
// In one configuration the function itself serves as its handle.
1217 KernelHandles[F->getName()] = F;
// Otherwise a constant global with F's type, linkage, DSO-locality and
// visibility is created and used as the handle.
1222 auto *Var =
new llvm::GlobalVariable(
1223 TheModule, F->getType(),
true, F->getLinkage(),
1228 Var->setDSOLocal(F->isDSOLocal());
1229 Var->setVisibility(F->getVisibility());
1231 KernelHandles[F->getName()] = Var;
1232 KernelStubs[Var] = F;
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
TranslationUnitDecl * getTranslationUnitDecl() const
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
bool shouldExternalize(const Decl *D) const
Whether a C++ static variable or CUDA/HIP kernel should be externalized.
const TargetInfo * getAuxTargetInfo() const
llvm::DenseSet< const VarDecl * > CUDADeviceVarODRUsedByHost
Keep track of CUDA/HIP device-side variables ODR-used by host code.
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by settin...
TypeInfoChars getTypeInfoInChars(const Type *T) const
const TargetInfo & getTargetInfo() const
unsigned getTargetAddressSpace(LangAS AS) const
CharUnits - This is an opaque type for sizes expressed in character units.
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
std::string CudaGpuBinaryFileName
Name of file passed with -fcuda-include-gpubinary option to forward to CUDA runtime back-end for inco...
llvm::Value * getPointer() const
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
virtual std::string getDeviceSideName(const NamedDecl *ND)=0
Returns function or variable name on device side even if the current compilation is for host.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args)=0
Emits a kernel launch stub.
virtual llvm::Function * getKernelStub(llvm::GlobalValue *Handle)=0
Get kernel stub by kernel handle.
virtual void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var)=0
Check whether a variable is a device variable and register it if true.
virtual llvm::Function * finalizeModule()=0
Finalize generated LLVM module.
virtual llvm::GlobalValue * getKernelHandle(llvm::Function *Stub, GlobalDecl GD)=0
Get kernel handle by stub function.
virtual void internalizeDeviceSideVar(const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage)=0
Adjust linkage of shadow variables in host compilation.
MangleContext & getMangleContext()
Gets the mangle context.
static CGCallee forDirect(llvm::Constant *functionPtr, const CGCalleeInfo &abstractInfo=CGCalleeInfo())
CGFunctionInfo - Class to encapsulate the information about a function definition.
CallArgList - Type for representing both the value and type of arguments in a call.
void add(RValue rvalue, QualType type)
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
const LangOptions & getLangOpts() const
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr...
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **callOrInvoke, bool IsMustTail, SourceLocation Loc)
EmitCall - Generate a call of the given function, expecting the given result type,...
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment and cas...
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
llvm::FunctionCallee CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false, bool AssumeConvergent=false)
Create or return a runtime function declaration with the specified type and name.
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
DiagnosticsEngine & getDiags() const
const LangOptions & getLangOpts() const
CodeGenTypes & getTypes()
const TargetInfo & getTarget() const
const llvm::DataLayout & getDataLayout() const
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
CGCXXABI & getCXXABI() const
const llvm::Triple & getTriple() const
ASTContext & getContext() const
const CodeGenOptions & getCodeGenOpts() const
StringRef getMangledName(GlobalDecl GD)
void maybeSetTrivialComdat(const Decl &D, llvm::GlobalObject &GO)
void printPostfixForExternalizedDecl(llvm::raw_ostream &OS, const Decl *D) const
Print the postfix for externalized static variable or kernels for single source offloading languages ...
This class organizes the cross-module state that is used while lowering AST types to LLVM types.
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
const CGFunctionInfo & arrangeFunctionDeclaration(const FunctionDecl *FD)
Free functions are functions that are compatible with an ordinary C function pointer type.
The standard implementation of ConstantInitBuilder used in Clang.
FunctionArgList - Type for representing both the decl and type of parameters to a function.
static RValue get(llvm::Value *V)
static RValue getAggregate(Address addr, bool isVolatile=false)
ReturnValueSlot - Contains the address where the return value of a function can be stored,...
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
Decl - This represents one declaration (or definition), e.g.
SourceLocation getLocation() const
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Represents a function declaration or definition.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl - represents a global declaration.
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
One of these records is kept for each identifier that is lexed.
StringRef getName() const
Return the actual identifier string.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
MangleContext - Context for tracking state which persists across multiple calls to the C++ name mangl...
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
This represents a decl that may have a name.
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Represents a parameter to a function.
A (possibly-)qualified type.
QualType getCanonicalType() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
RecordDecl * getDecl() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
const llvm::VersionTuple & getSDKVersion() const
A template argument list.
unsigned size() const
Retrieve the number of template arguments in this template argument list.
The top declaration context.
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
The base class of the type hierarchy.
const T * castAs() const
Member-template castAs<specific type>.
bool isCUDADeviceBuiltinSurfaceType() const
Check if the type is the CUDA device builtin surface type.
bool isCUDADeviceBuiltinTextureType() const
Check if the type is the CUDA device builtin texture type.
Represents a variable declaration or definition.
bool isInline() const
Whether this variable is (C++1z) inline.
bool hasExternalStorage() const
Returns true if a variable has extern or private_extern storage.
DefinitionKind hasDefinition(ASTContext &) const
Check whether this variable is defined in this translation unit.
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
@ OS
Indicates that the tracking object is a descendant of a reference-counted OSObject,...
bool Zero(InterpState &S, CodePtr OpPC)
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
CharUnits getSizeAlign() const
CharUnits getPointerAlign() const