23#include "llvm/ADT/StringRef.h"
24#include "llvm/Frontend/Offloading/Utility.h"
25#include "llvm/IR/BasicBlock.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/DerivedTypes.h"
28#include "llvm/IR/ReplaceConstant.h"
29#include "llvm/Support/Format.h"
30#include "llvm/Support/VirtualFileSystem.h"
33using namespace CodeGen;
36constexpr unsigned CudaFatMagic = 0x466243b1;
37constexpr unsigned HIPFatMagic = 0x48495046;
44 StringRef SectionPrefix;
47 llvm::IntegerType *IntTy, *SizeTy;
49 llvm::PointerType *PtrTy;
52 llvm::LLVMContext &Context;
54 llvm::Module &TheModule;
64 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
66 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
68 llvm::GlobalVariable *Var;
76 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
78 bool RelocatableDeviceCode;
80 std::unique_ptr<MangleContext> DeviceMC;
82 llvm::FunctionCallee getSetupArgumentFn()
const;
83 llvm::FunctionCallee getLaunchFn()
const;
85 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
86 llvm::FunctionType *getCallbackFnTy()
const;
87 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
88 std::string addPrefixToName(StringRef FuncName)
const;
89 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
92 llvm::Function *makeRegisterGlobalsFn();
97 llvm::Constant *makeConstantString(
const std::string &Str,
98 const std::string &Name =
"") {
99 return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
105 llvm::Constant *makeConstantArray(StringRef Str,
107 StringRef SectionName =
"",
108 unsigned Alignment = 0,
109 bool AddNull =
false) {
110 llvm::Constant *
Value =
111 llvm::ConstantDataArray::getString(Context, Str, AddNull);
112 auto *GV =
new llvm::GlobalVariable(
114 llvm::GlobalValue::PrivateLinkage,
Value, Name);
115 if (!SectionName.empty()) {
116 GV->setSection(SectionName);
119 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
122 GV->setAlignment(llvm::Align(Alignment));
127 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
128 assert(FnTy->getReturnType()->isVoidTy() &&
129 "Can only generate dummy functions returning void!");
130 llvm::Function *DummyFunc = llvm::Function::Create(
131 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
133 llvm::BasicBlock *DummyBlock =
134 llvm::BasicBlock::Create(Context,
"", DummyFunc);
136 FuncBuilder.SetInsertPoint(DummyBlock);
137 FuncBuilder.CreateRetVoid();
149 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
150 bool Extern,
bool Constant) {
151 DeviceVars.push_back({&Var,
154 VD->hasAttr<HIPManagedAttr>(),
157 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
158 bool Extern,
int Type) {
159 DeviceVars.push_back({&Var,
165 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
166 bool Extern,
int Type,
bool Normalized) {
167 DeviceVars.push_back({&Var,
170 false, Normalized,
Type}});
174 llvm::Function *makeModuleCtorFunction();
176 llvm::Function *makeModuleDtorFunction();
178 void transformManagedVars();
180 void createOffloadingEntries();
186 llvm::Function *
getKernelStub(llvm::GlobalValue *Handle)
override {
187 auto Loc = KernelStubs.find(Handle);
188 assert(
Loc != KernelStubs.end());
193 llvm::GlobalVariable &Var)
override;
196 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
203std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
204 return (Prefix + FuncName).str();
207CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
208 return (
"__" + Prefix + FuncName).str();
218 return std::unique_ptr<MangleContext>(
229 TheModule(CGM.getModule()),
230 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
239 SectionPrefix =
"omp";
241 SectionPrefix = Prefix =
"hip";
243 SectionPrefix = Prefix =
"cuda";
246llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
248 llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
250 llvm::FunctionType::get(IntTy, Params,
false),
251 addPrefixToName(
"SetupArgument"));
254llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
258 llvm::FunctionType::get(IntTy, PtrTy,
false),
"hipLaunchByPtr");
265llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
266 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
269llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
270 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
273llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
274 llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
275 llvm::PointerType::getUnqual(Context)};
276 return llvm::FunctionType::get(VoidTy, Params,
false);
279std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
282 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
283 GD =
GlobalDecl(FD, KernelReferenceKind::Kernel);
286 std::string DeviceSideName;
294 llvm::raw_svector_ostream Out(Buffer);
296 DeviceSideName = std::string(Out.str());
304 llvm::raw_svector_ostream Out(Buffer);
305 Out << DeviceSideName;
307 DeviceSideName = std::string(Out.str());
309 return DeviceSideName;
316 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
317 GV->setLinkage(CGF.
CurFn->getLinkage());
318 GV->setInitializer(CGF.
CurFn);
321 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
324 emitDeviceStubBodyNew(CGF, Args);
326 emitDeviceStubBodyLegacy(CGF, Args);
339 for (
auto &Arg : Args)
341 llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);
343 auto *Int64Ty = CGF.
Builder.getInt64Ty();
344 KernelLaunchParamsTypes.push_back(Int64Ty);
345 KernelLaunchParamsTypes.push_back(PtrTy);
346 KernelLaunchParamsTypes.push_back(PtrTy);
348 llvm::StructType *KernelLaunchParamsTy =
349 llvm::StructType::create(KernelLaunchParamsTypes);
354 "kernel_launch_params");
356 auto KernelArgsSize = CGM.
getDataLayout().getTypeAllocSize(KernelArgsTy);
364 for (
unsigned i = 0; i < Args.size(); ++i) {
369 return KernelLaunchParams;
379 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
381 for (
unsigned i = 0; i < Args.size(); ++i) {
383 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, PtrTy);
385 VoidVarPtr, CGF.
Builder.CreateConstGEP1_32(
397 ? prepareKernelArgsLLVMOffload(CGF, Args)
398 : prepareKernelArgs(CGF, Args);
414 std::string KernelLaunchAPI =
"LaunchKernel";
416 LangOptions::GPUDefaultStreamKind::PerThread) {
418 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
420 KernelLaunchAPI = KernelLaunchAPI +
"_ptsz";
422 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
426 for (
auto *Result : DC->
lookup(&cudaLaunchKernelII)) {
428 cudaLaunchKernelFD = FD;
431 if (cudaLaunchKernelFD ==
nullptr) {
433 "Can't find declaration for " + LaunchKernelName);
447 llvm::FunctionType::get(IntTy,
453 addUnderscoredPrefixToName(
"PopCallConfiguration"));
462 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
478 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
482 llvm::FunctionCallee cudaLaunchKernelFn =
492 llvm::Function *KernelFunction = llvm::cast<llvm::Function>(
Kernel);
493 std::string GlobalVarName = (KernelFunction->getName() +
".id").str();
495 llvm::GlobalVariable *HandleVar =
496 CGM.
getModule().getNamedGlobal(GlobalVarName);
498 HandleVar =
new llvm::GlobalVariable(
500 false, KernelFunction->getLinkage(),
501 llvm::ConstantInt::get(CGM.
Int8Ty, 0), GlobalVarName);
502 HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
503 HandleVar->setVisibility(KernelFunction->getVisibility());
504 if (KernelFunction->hasComdat())
505 HandleVar->setComdat(CGM.
getModule().getOrInsertComdat(GlobalVarName));
521 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
524 for (
const VarDecl *A : Args) {
526 Offset = Offset.alignTo(TInfo.Align);
527 llvm::Value *Args[] = {
530 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
531 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
534 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
535 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB, Zero);
537 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
539 Offset += TInfo.Width;
543 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
545 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
555 llvm::GlobalVariable *ManagedVar) {
557 for (
auto &&VarUse : Var->uses()) {
558 WorkList.push_back({VarUse.getUser()});
560 while (!WorkList.empty()) {
561 auto &&WorkItem = WorkList.pop_back_val();
562 auto *
U = WorkItem.back();
563 if (isa<llvm::ConstantExpr>(
U)) {
564 for (
auto &&UU :
U->uses()) {
565 WorkItem.push_back(UU.getUser());
566 WorkList.push_back(WorkItem);
571 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
572 llvm::Value *OldV = Var;
573 llvm::Instruction *NewV =
new llvm::LoadInst(
574 Var->getType(), ManagedVar,
"ld.managed",
false,
575 llvm::Align(Var->getAlignment()), I->getIterator());
579 for (
auto &&Op : WorkItem) {
580 auto *CE = cast<llvm::ConstantExpr>(Op);
581 auto *NewInst = CE->getAsInstruction();
582 NewInst->insertBefore(*I->getParent(), I->getIterator());
583 NewInst->replaceUsesOfWith(OldV, NewV);
587 I->replaceUsesOfWith(OldV, NewV);
589 llvm_unreachable(
"Invalid use of managed variable");
608llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
610 if (EmittedKernels.empty() && DeviceVars.empty())
613 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
614 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
615 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
616 llvm::BasicBlock *EntryBB =
617 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
619 Builder.SetInsertPoint(EntryBB);
623 llvm::Type *RegisterFuncParams[] = {
624 PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
625 PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
627 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
628 addUnderscoredPrefixToName(
"RegisterFunction"));
633 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
634 for (
auto &&I : EmittedKernels) {
635 llvm::Constant *KernelName =
636 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
637 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
638 llvm::Value *Args[] = {
640 KernelHandles[I.Kernel->getName()],
643 llvm::ConstantInt::get(IntTy, -1),
648 llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
649 Builder.CreateCall(RegisterFunc, Args);
652 llvm::Type *VarSizeTy = IntTy;
660 llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
661 IntTy, VarSizeTy, IntTy, IntTy};
663 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
664 addUnderscoredPrefixToName(
"RegisterVar"));
667 llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
668 PtrTy, VarSizeTy, IntTy};
670 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
671 addUnderscoredPrefixToName(
"RegisterManagedVar"));
675 llvm::FunctionType::get(
676 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy},
false),
677 addUnderscoredPrefixToName(
"RegisterSurface"));
681 llvm::FunctionType::get(
682 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy},
false),
683 addUnderscoredPrefixToName(
"RegisterTexture"));
684 for (
auto &&Info : DeviceVars) {
685 llvm::GlobalVariable *Var = Info.Var;
686 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
687 "External variables should not show up here, except HIP managed "
689 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
690 switch (Info.Flags.getKind()) {
691 case DeviceVarFlags::Variable: {
694 if (Info.Flags.isManaged()) {
695 assert(Var->getName().ends_with(
".managed") &&
696 "HIP managed variables not transformed");
697 auto *ManagedVar = CGM.
getModule().getNamedGlobal(
698 Var->getName().drop_back(StringRef(
".managed").size()));
699 llvm::Value *Args[] = {
704 llvm::ConstantInt::get(VarSizeTy, VarSize),
705 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
706 if (!Var->isDeclaration())
707 Builder.CreateCall(RegisterManagedVar, Args);
709 llvm::Value *Args[] = {
714 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
715 llvm::ConstantInt::get(VarSizeTy, VarSize),
716 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
717 llvm::ConstantInt::get(IntTy, 0)};
718 Builder.CreateCall(RegisterVar, Args);
722 case DeviceVarFlags::Surface:
725 {&GpuBinaryHandlePtr, Var, VarName, VarName,
726 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
727 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
729 case DeviceVarFlags::Texture:
732 {&GpuBinaryHandlePtr, Var, VarName, VarName,
733 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
734 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
735 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
740 Builder.CreateRetVoid();
741 return RegisterKernelsFunc;
763llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
768 if (CudaGpuBinaryFileName.empty() && !IsHIP)
770 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
775 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
778 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
779 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
783 llvm::FunctionType::get(PtrTy, PtrTy,
false),
784 addUnderscoredPrefixToName(
"RegisterFatBinary"));
786 llvm::StructType *FatbinWrapperTy =
787 llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
793 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
794 if (!CudaGpuBinaryFileName.empty()) {
796 auto CudaGpuBinaryOrErr =
797 VFS->getBufferForFile(CudaGpuBinaryFileName, -1,
false);
798 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
800 << CudaGpuBinaryFileName << EC.message();
803 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
806 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
807 llvm::FunctionType::get(VoidTy,
false),
808 llvm::GlobalValue::InternalLinkage,
809 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
810 llvm::BasicBlock *CtorEntryBB =
811 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
814 CtorBuilder.SetInsertPoint(CtorEntryBB);
816 const char *FatbinConstantName;
817 const char *FatbinSectionName;
818 const char *ModuleIDSectionName;
819 StringRef ModuleIDPrefix;
820 llvm::Constant *FatBinStr;
823 FatbinConstantName =
".hip_fatbin";
824 FatbinSectionName =
".hipFatBinSegment";
826 ModuleIDSectionName =
"__hip_module_id";
827 ModuleIDPrefix =
"__hip_";
832 const unsigned HIPCodeObjectAlign = 4096;
833 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
834 FatbinConstantName, HIPCodeObjectAlign);
840 FatBinStr =
new llvm::GlobalVariable(
842 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
844 llvm::GlobalVariable::NotThreadLocal);
845 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
848 FatMagic = HIPFatMagic;
850 if (RelocatableDeviceCode)
851 FatbinConstantName = CGM.
getTriple().isMacOSX()
852 ?
"__NV_CUDA,__nv_relfatbin"
856 CGM.
getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
859 CGM.
getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
861 ModuleIDSectionName = CGM.
getTriple().isMacOSX()
862 ?
"__NV_CUDA,__nv_module_id"
864 ModuleIDPrefix =
"__nv_";
868 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
869 FatbinConstantName, 8);
870 FatMagic = CudaFatMagic;
875 auto Values = Builder.beginStruct(FatbinWrapperTy);
877 Values.addInt(IntTy, FatMagic);
879 Values.addInt(IntTy, 1);
881 Values.add(FatBinStr);
883 Values.add(llvm::ConstantPointerNull::get(PtrTy));
884 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
887 FatbinWrapper->setSection(FatbinSectionName);
897 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
898 : llvm::GlobalValue::ExternalLinkage;
899 llvm::BasicBlock *IfBlock =
900 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
901 llvm::BasicBlock *ExitBlock =
902 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
905 GpuBinaryHandle =
new llvm::GlobalVariable(
906 TheModule, PtrTy,
false,
Linkage,
908 CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) :
nullptr,
910 ?
"__hip_gpubin_handle"
914 if (
Linkage != llvm::GlobalValue::InternalLinkage)
915 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
917 GpuBinaryHandle, PtrTy,
920 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
921 llvm::Constant *
Zero =
922 llvm::Constant::getNullValue(HandleValue->getType());
923 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
924 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
927 CtorBuilder.SetInsertPoint(IfBlock);
929 llvm::CallInst *RegisterFatbinCall =
930 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
931 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
932 CtorBuilder.CreateBr(ExitBlock);
935 CtorBuilder.SetInsertPoint(ExitBlock);
937 if (RegisterGlobalsFunc) {
938 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
939 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
942 }
else if (!RelocatableDeviceCode) {
946 llvm::CallInst *RegisterFatbinCall =
947 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
948 GpuBinaryHandle =
new llvm::GlobalVariable(
949 TheModule, PtrTy,
false, llvm::GlobalValue::InternalLinkage,
950 llvm::ConstantPointerNull::get(PtrTy),
"__cuda_gpubin_handle");
952 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
956 if (RegisterGlobalsFunc)
957 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
961 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
964 llvm::FunctionType::get(VoidTy, PtrTy,
false),
965 "__cudaRegisterFatBinaryEnd");
966 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
971 llvm::raw_svector_ostream OS(ModuleID);
972 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
973 llvm::Constant *ModuleIDConstant = makeConstantArray(
974 std::string(ModuleID),
"", ModuleIDSectionName, 32,
true);
977 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
978 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
983 RegisterLinkedBinaryName += ModuleID;
985 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
987 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
988 llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
989 makeDummyFunction(getCallbackFnTy())};
990 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
996 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
998 llvm::FunctionType *AtExitTy =
999 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
1000 llvm::FunctionCallee AtExitFunc =
1003 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
1006 CtorBuilder.CreateRetVoid();
1007 return ModuleCtorFunc;
1029llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
1031 if (!GpuBinaryHandle)
1036 llvm::FunctionType::get(VoidTy, PtrTy,
false),
1037 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
1039 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
1040 llvm::FunctionType::get(VoidTy,
false),
1041 llvm::GlobalValue::InternalLinkage,
1042 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
1044 llvm::BasicBlock *DtorEntryBB =
1045 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
1047 DtorBuilder.SetInsertPoint(DtorEntryBB);
1050 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
1052 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
1057 llvm::BasicBlock *IfBlock =
1058 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
1059 llvm::BasicBlock *ExitBlock =
1060 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
1061 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
1062 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
1063 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
1065 DtorBuilder.SetInsertPoint(IfBlock);
1066 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1067 DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
1068 DtorBuilder.CreateBr(ExitBlock);
1070 DtorBuilder.SetInsertPoint(ExitBlock);
1072 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1074 DtorBuilder.CreateRetVoid();
1075 return ModuleDtorFunc;
1079 return new CGNVCUDARuntime(CGM);
1082void CGNVCUDARuntime::internalizeDeviceSideVar(
1101 D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1102 D->getType()->isCUDADeviceBuiltinTextureType()) {
1103 Linkage = llvm::GlobalValue::InternalLinkage;
1107void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *
D,
1108 llvm::GlobalVariable &GV) {
1123 if ((!
D->hasExternalStorage() && !
D->isInline()) ||
1126 registerDeviceVar(
D, GV, !
D->hasDefinition(),
1129 }
else if (
D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1130 D->getType()->isCUDADeviceBuiltinTextureType()) {
1133 const auto *TD = cast<ClassTemplateSpecializationDecl>(
1136 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1137 assert(Args.
size() == 2 &&
1138 "Unexpected number of template arguments of CUDA device "
1139 "builtin surface type.");
1140 auto SurfType = Args[1].getAsIntegral();
1141 if (!
D->hasExternalStorage())
1142 registerDeviceSurf(
D, GV, !
D->hasDefinition(), SurfType.getSExtValue());
1144 assert(Args.
size() == 3 &&
1145 "Unexpected number of template arguments of CUDA device "
1146 "builtin texture type.");
1147 auto TexType = Args[1].getAsIntegral();
1148 auto Normalized = Args[2].getAsIntegral();
1149 if (!
D->hasExternalStorage())
1150 registerDeviceTex(
D, GV, !
D->hasDefinition(), TexType.getSExtValue(),
1151 Normalized.getZExtValue());
1160void CGNVCUDARuntime::transformManagedVars() {
1161 for (
auto &&Info : DeviceVars) {
1162 llvm::GlobalVariable *Var = Info.Var;
1163 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1164 Info.Flags.isManaged()) {
1165 auto *ManagedVar =
new llvm::GlobalVariable(
1167 false, Var->getLinkage(),
1168 Var->isDeclaration()
1170 : llvm::ConstantPointerNull::get(Var->getType()),
1172 llvm::GlobalVariable::NotThreadLocal,
1174 ? LangAS::cuda_device
1175 : LangAS::Default));
1176 ManagedVar->setDSOLocal(Var->isDSOLocal());
1177 ManagedVar->setVisibility(Var->getVisibility());
1178 ManagedVar->setExternallyInitialized(
true);
1180 ManagedVar->takeName(Var);
1181 Var->setName(Twine(ManagedVar->getName()) +
".managed");
1184 if (CGM.
getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1185 assert(!ManagedVar->isDeclaration());
1196void CGNVCUDARuntime::createOffloadingEntries() {
1198 StringRef Section = (SectionPrefix +
"_offloading_entries").toStringRef(Out);
1201 for (KernelInfo &I : EmittedKernels)
1202 llvm::offloading::emitOffloadingEntry(
1203 M, KernelHandles[I.Kernel->getName()],
1204 getDeviceSideName(cast<NamedDecl>(I.D)), 0, 0,
1205 llvm::offloading::OffloadGlobalEntry, Section);
1207 for (VarInfo &I : DeviceVars) {
1209 CGM.
getDataLayout().getTypeAllocSize(I.Var->getValueType());
1212 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
1214 (I.Flags.isConstant()
1215 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
1217 (I.Flags.isNormalized()
1218 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
1220 if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1221 llvm::offloading::emitOffloadingEntry(
1222 M, I.Var, getDeviceSideName(I.D), VarSize,
1223 (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
1224 : llvm::offloading::OffloadGlobalEntry) |
1227 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1228 llvm::offloading::emitOffloadingEntry(
1229 M, I.Var, getDeviceSideName(I.D), VarSize,
1230 llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1231 I.Flags.getSurfTexType(), Section);
1232 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1233 llvm::offloading::emitOffloadingEntry(
1234 M, I.Var, getDeviceSideName(I.D), VarSize,
1235 llvm::offloading::OffloadGlobalTextureEntry | Flags,
1236 I.Flags.getSurfTexType(), Section);
1242llvm::Function *CGNVCUDARuntime::finalizeModule() {
1243 transformManagedVars();
1255 for (
auto &&Info : DeviceVars) {
1256 auto Kind = Info.Flags.getKind();
1257 if (!Info.Var->isDeclaration() &&
1258 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1259 (Kind == DeviceVarFlags::Variable ||
1260 Kind == DeviceVarFlags::Surface ||
1261 Kind == DeviceVarFlags::Texture) &&
1262 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1269 (CGM.
getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
1270 createOffloadingEntries();
1272 return makeModuleCtorFunction();
1277llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1279 auto Loc = KernelHandles.find(F->getName());
1280 if (
Loc != KernelHandles.end()) {
1281 auto OldHandle =
Loc->second;
1282 if (KernelStubs[OldHandle] == F)
1290 KernelStubs[OldHandle] = F;
1295 KernelStubs.erase(OldHandle);
1299 KernelHandles[F->getName()] = F;
1304 auto *Var =
new llvm::GlobalVariable(
1305 TheModule, F->getType(),
true, F->getLinkage(),
1310 Var->setDSOLocal(F->isDSOLocal());
1311 Var->setVisibility(F->getVisibility());
1312 auto *FD = cast<FunctionDecl>(GD.
getDecl());
1313 auto *FT = FD->getPrimaryTemplate();
1314 if (!FT || FT->isThisDeclarationADefinition())
1316 KernelHandles[F->getName()] = Var;
1317 KernelStubs[Var] = F;
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
TranslationUnitDecl * getTranslationUnitDecl() const
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
bool shouldExternalize(const Decl *D) const
Whether a C++ static variable or CUDA/HIP kernel should be externalized.
StringRef getCUIDHash() const
const TargetInfo * getAuxTargetInfo() const
llvm::DenseSet< const VarDecl * > CUDADeviceVarODRUsedByHost
Keep track of CUDA/HIP device-side variables ODR-used by host code.
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by settin...
TypeInfoChars getTypeInfoInChars(const Type *T) const
const TargetInfo & getTargetInfo() const
unsigned getTargetAddressSpace(LangAS AS) const
CharUnits - This is an opaque type for sizes expressed in character units.
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
static CharUnits One()
One - Construct a CharUnits quantity of one.
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
std::string CudaGpuBinaryFileName
Name of file passed with -fcuda-include-gpubinary option to forward to CUDA runtime back-end for inco...
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
llvm::StoreInst * CreateAlignedStore(llvm::Value *Val, llvm::Value *Addr, CharUnits Align, bool IsVolatile=false)
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
Address CreateStructGEP(Address Addr, unsigned Index, const llvm::Twine &Name="")
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
virtual std::string getDeviceSideName(const NamedDecl *ND)=0
Returns function or variable name on device side even if the current compilation is for host.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args)=0
Emits a kernel launch stub.
virtual llvm::Function * getKernelStub(llvm::GlobalValue *Handle)=0
Get kernel stub by kernel handle.
virtual void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var)=0
Check whether a variable is a device variable and register it if true.
virtual llvm::Function * finalizeModule()=0
Finalize generated LLVM module.
virtual llvm::GlobalValue * getKernelHandle(llvm::Function *Stub, GlobalDecl GD)=0
Get kernel handle by stub function.
virtual void internalizeDeviceSideVar(const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage)=0
Adjust linkage of shadow variables in host compilation.
MangleContext & getMangleContext()
Gets the mangle context.
static CGCallee forDirect(llvm::Constant *functionPtr, const CGCalleeInfo &abstractInfo=CGCalleeInfo())
CGFunctionInfo - Class to encapsulate the information about a function definition.
CallArgList - Type for representing both the value and type of arguments in a call.
void add(RValue rvalue, QualType type)
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
RawAddress CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits align, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
const LangOptions & getLangOpts() const
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr...
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **callOrInvoke, bool IsMustTail, SourceLocation Loc, bool IsVirtualFunctionPointerThunk=false)
EmitCall - Generate a call of the given function, expecting the given result type,...
llvm::Type * ConvertTypeForMem(QualType T)
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
llvm::FunctionCallee CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false, bool AssumeConvergent=false)
Create or return a runtime function declaration with the specified type and name.
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
const IntrusiveRefCntPtr< llvm::vfs::FileSystem > & getFileSystem() const
DiagnosticsEngine & getDiags() const
const LangOptions & getLangOpts() const
CodeGenTypes & getTypes()
const TargetInfo & getTarget() const
const llvm::DataLayout & getDataLayout() const
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
CGCXXABI & getCXXABI() const
const llvm::Triple & getTriple() const
ASTContext & getContext() const
const CodeGenOptions & getCodeGenOpts() const
StringRef getMangledName(GlobalDecl GD)
void maybeSetTrivialComdat(const Decl &D, llvm::GlobalObject &GO)
void printPostfixForExternalizedDecl(llvm::raw_ostream &OS, const Decl *D) const
Print the postfix for externalized static variable or kernels for single source offloading languages ...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
const CGFunctionInfo & arrangeFunctionDeclaration(const FunctionDecl *FD)
Free functions are functions that are compatible with an ordinary C function pointer type.
The standard implementation of ConstantInitBuilder used in Clang.
FunctionArgList - Type for representing both the decl and type of parameters to a function.
static RValue get(llvm::Value *V)
static RValue getAggregate(Address addr, bool isVolatile=false)
Convert an Address to an RValue.
ReturnValueSlot - Contains the address where the return value of a function can be stored,...
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
Decl - This represents one declaration (or definition), e.g.
SourceLocation getLocation() const
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Represents a function declaration or definition.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl - represents a global declaration.
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
One of these records is kept for each identifier that is lexed.
StringRef getName() const
Return the actual identifier string.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
MangleContext - Context for tracking state which persists across multiple calls to the C++ name mangl...
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
This represents a decl that may have a name.
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Represents a parameter to a function.
A (possibly-)qualified type.
QualType getCanonicalType() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
RecordDecl * getDecl() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
const llvm::VersionTuple & getSDKVersion() const
A template argument list.
unsigned size() const
Retrieve the number of template arguments in this template argument list.
The top declaration context.
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
The base class of the type hierarchy.
Represents a variable declaration or definition.
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
bool Zero(InterpState &S, CodePtr OpPC)
The JSON file list parser is used to communicate input to InstallAPI.
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
CharUnits getSizeAlign() const
llvm::PointerType * UnqualPtrTy
CharUnits getPointerAlign() const