23#include "llvm/ADT/StringRef.h"
24#include "llvm/Frontend/Offloading/Utility.h"
25#include "llvm/IR/BasicBlock.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/DerivedTypes.h"
28#include "llvm/IR/ReplaceConstant.h"
29#include "llvm/Support/Format.h"
30#include "llvm/Support/VirtualFileSystem.h"
36constexpr unsigned CudaFatMagic = 0x466243b1;
37constexpr unsigned HIPFatMagic = 0x48495046;
45 llvm::IntegerType *IntTy, *SizeTy;
47 llvm::PointerType *PtrTy;
50 llvm::LLVMContext &Context;
52 llvm::Module &TheModule;
55 llvm::Function *Kernel;
58 llvm::SmallVector<KernelInfo, 16> EmittedKernels;
62 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
64 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
66 llvm::GlobalVariable *Var;
70 llvm::SmallVector<VarInfo, 16> DeviceVars;
74 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
76 bool RelocatableDeviceCode;
78 std::unique_ptr<MangleContext> DeviceMC;
80 llvm::FunctionCallee getSetupArgumentFn()
const;
81 llvm::FunctionCallee getLaunchFn()
const;
83 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
84 llvm::FunctionType *getCallbackFnTy()
const;
85 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
86 std::string addPrefixToName(StringRef FuncName)
const;
87 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
90 llvm::Function *makeRegisterGlobalsFn();
// Returns the pointer of the constant C string that
// CGM.GetAddrOfConstantCString produces for Str. Name (empty by default) is
// forwarded unchanged to that call.
95 llvm::Constant *makeConstantString(
const std::string &Str,
96 const std::string &Name =
"") {
97 return CGM.GetAddrOfConstantCString(Str, Name).getPointer();
// Creates a private, constant global in TheModule whose initializer is a
// ConstantDataArray built from Str. AddNull is passed straight to
// ConstantDataArray::getString. SectionName (default empty) and Alignment
// (default 0) are optional placement attributes for the new global.
103 llvm::Constant *makeConstantArray(StringRef Str,
105 StringRef SectionName =
"",
106 unsigned Alignment = 0,
107 bool AddNull =
false) {
108 llvm::Constant *
Value =
109 llvm::ConstantDataArray::getString(Context, Str, AddNull);
110 auto *GV =
new llvm::GlobalVariable(
111 TheModule,
Value->getType(),
true,
112 llvm::GlobalValue::PrivateLinkage,
Value, Name);
// A section is only assigned when the caller supplied a non-empty name.
113 if (!SectionName.empty()) {
114 GV->setSection(SectionName);
// Clears the unnamed_addr marking on the global.
// NOTE(review): presumably so the section contents are not merged away --
// confirm against the full source.
117 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
120 GV->setAlignment(llvm::Align(Alignment));
// Creates an internal-linkage function named "dummy" of type FnTy in
// TheModule whose only basic block contains a single `ret void`.
// FnTy must return void; this is enforced by the assert below.
125 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
126 assert(FnTy->getReturnType()->isVoidTy() &&
127 "Can only generate dummy functions returning void!");
128 llvm::Function *DummyFunc = llvm::Function::Create(
129 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
131 llvm::BasicBlock *DummyBlock =
132 llvm::BasicBlock::Create(Context,
"", DummyFunc);
// The body is emitted with a dedicated builder positioned in the new block.
133 CGBuilderTy FuncBuilder(CGM, Context);
134 FuncBuilder.SetInsertPoint(DummyBlock);
135 FuncBuilder.CreateRetVoid();
140 Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
141 Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
142 FunctionArgList &Args);
143 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
144 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
145 std::string getDeviceSideName(
const NamedDecl *ND)
override;
// Appends an entry for Var to DeviceVars, tagged DeviceVarFlags::Variable.
// The flags record Extern, Constant and whether VD carries HIPManagedAttr.
// NOTE(review): the remaining flag fields are not visible in this listing.
147 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
149 DeviceVars.push_back({&Var,
151 {DeviceVarFlags::Variable, Extern,
Constant,
152 VD->hasAttr<HIPManagedAttr>(),
// Appends an entry for Var to DeviceVars, tagged DeviceVarFlags::Surface.
// Extern is stored in the flags and the following flag is fixed to false.
// NOTE(review): how Type is stored is not visible in this listing.
155 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
156 bool Extern,
int Type) {
157 DeviceVars.push_back({&Var,
159 {DeviceVarFlags::Surface, Extern,
false,
// Appends an entry for Var to DeviceVars, tagged DeviceVarFlags::Texture.
// The flags carry Extern, two fields fixed to false, then Normalized and Type.
163 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
164 bool Extern,
int Type,
bool Normalized) {
165 DeviceVars.push_back({&Var,
167 {DeviceVarFlags::Texture, Extern,
false,
168 false, Normalized,
Type}});
172 llvm::Function *makeModuleCtorFunction();
174 llvm::Function *makeModuleDtorFunction();
176 void transformManagedVars();
178 void createOffloadingEntries();
181 CGNVCUDARuntime(CodeGenModule &CGM);
183 llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD)
override;
// Looks Handle up in the KernelStubs map. The assert requires that an entry
// for Handle has already been recorded.
184 llvm::Function *getKernelStub(llvm::GlobalValue *Handle)
override {
185 auto Loc = KernelStubs.find(Handle);
186 assert(Loc != KernelStubs.end());
189 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args)
override;
190 void handleVarRegistration(
const VarDecl *VD,
191 llvm::GlobalVariable &Var)
override;
193 internalizeDeviceSideVar(
const VarDecl *D,
194 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
196 llvm::Function *finalizeModule()
override;
// Returns Prefix followed by FuncName as a std::string.
201std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
202 return (Prefix + FuncName).str();
// Returns "__" followed by Prefix and then FuncName as a std::string.
205CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
206 return (
"__" + Prefix + FuncName).str();
216 return std::unique_ptr<MangleContext>(
225CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
226 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
227 TheModule(CGM.getModule()),
228 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
// Produces the callee for the runtime's argument-setup entry point, named
// Prefix + "SetupArgument". Its signature is IntTy(PtrTy, SizeTy, SizeTy).
243llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
245 llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
247 llvm::FunctionType::get(IntTy, Params,
false),
248 addPrefixToName(
"SetupArgument"));
// Produces the callee for the legacy kernel-launch entry point.
// The visible lines build an IntTy(PtrTy) signature named "hipLaunchByPtr".
// NOTE(review): any non-HIP alternative is not visible in this listing.
251llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
255 llvm::FunctionType::get(IntTy, PtrTy,
false),
"hipLaunchByPtr");
// Type of the globals-registration function: VoidTy(PtrTy), non-variadic.
262llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
263 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
// Type of the callback function: VoidTy(PtrTy), non-variadic.
266llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
267 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
// Type of the linked-binary registration function: a non-variadic function
// returning VoidTy and taking four pointers (an unqualified pointer, PtrTy,
// PtrTy, and another unqualified pointer).
270llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
271 llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
272 llvm::PointerType::getUnqual(Context)};
273 return llvm::FunctionType::get(VoidTy, Params,
false);
// Computes the device-side name of ND and returns it as a std::string.
// NOTE(review): the mangling calls that fill the buffers are not visible in
// this listing; only the buffer handling can be read here.
276std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
// Function declarations are wrapped in a GlobalDecl of kind Kernel.
279 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
280 GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
283 std::string DeviceSideName;
// First pass: the name is produced into a stack buffer and copied out.
290 SmallString<256> Buffer;
291 llvm::raw_svector_ostream
Out(Buffer);
293 DeviceSideName = std::string(
Out.str());
// Second pass: the current name is re-emitted into a fresh buffer and the
// result is copied back into DeviceSideName.
// NOTE(review): presumably a postfix is appended in between -- confirm.
300 SmallString<256> Buffer;
301 llvm::raw_svector_ostream
Out(Buffer);
302 Out << DeviceSideName;
304 DeviceSideName = std::string(
Out.str());
306 return DeviceSideName;
// Emits the host-side stub body for the kernel whose stub is CGF.CurFn.
// It delegates to emitDeviceStubBodyNew or emitDeviceStubBodyLegacy.
// The choice involves CudaFeature::CUDA_USES_NEW_LAUNCH.
// NOTE(review): the full selection condition is not visible in this listing.
309void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
310 FunctionArgList &Args) {
// When the handle stored under the stub's name is a global variable, it takes
// the stub's linkage and is initialized with the stub function itself.
313 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
314 GV->setLinkage(CGF.
CurFn->getLinkage());
315 GV->setInitializer(CGF.
CurFn);
318 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
321 emitDeviceStubBodyNew(CGF, Args);
323 emitDeviceStubBodyLegacy(CGF, Args);
// Prepares kernel arguments for the LLVM offload launch path and returns
// KernelLaunchParams. Two struct types are created: one from the collected
// argument types, and a launch-parameter struct of {i64, PtrTy, PtrTy}.
// NOTE(review): the allocas and stores that populate them are not visible
// in this listing.
333Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
334 FunctionArgList &Args) {
335 SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
336 for (
auto &Arg : Args)
338 llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);
// Launch-parameter layout: a 64-bit integer followed by two pointers.
340 auto *Int64Ty = CGF.
Builder.getInt64Ty();
341 KernelLaunchParamsTypes.push_back(Int64Ty);
342 KernelLaunchParamsTypes.push_back(PtrTy);
343 KernelLaunchParamsTypes.push_back(PtrTy);
345 llvm::StructType *KernelLaunchParamsTy =
346 llvm::StructType::create(KernelLaunchParamsTypes);
351 "kernel_launch_params");
// Allocation size of the argument struct according to the data layout.
353 auto KernelArgsSize = CGM.
getDataLayout().getTypeAllocSize(KernelArgsTy);
361 for (
unsigned i = 0; i < Args.size(); ++i) {
366 return KernelLaunchParams;
// Prepares the kernel argument array for the launch call.
// The element count is max(1, Args.size()), so it is at least one even when
// the kernel has no arguments.
369Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
370 FunctionArgList &Args) {
376 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
// Each argument's address is cast to PtrTy and paired with the i-th slot.
// NOTE(review): the store itself is only partially visible in this listing.
378 for (
unsigned i = 0; i < Args.size(); ++i) {
380 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, PtrTy);
382 VoidVarPtr, CGF.
Builder.CreateConstGEP1_32(
// Emits the stub body for the newer launch API, Prefix + "LaunchKernel"
// (optionally with a "_spt" or "_ptsz" suffix). The launch function's
// declaration is looked up in the translation unit, and failure to find it
// is diagnosed with "Can't find declaration for ...".
// NOTE(review): several statements of this function are not visible in this
// listing; the comments below describe only the visible ones.
390void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
391 FunctionArgList &Args) {
// Arguments are marshalled by one of the two prepareKernelArgs* helpers.
394 ? prepareKernelArgsLLVMOffload(CGF, Args)
395 : prepareKernelArgs(CGF, Args);
409 TranslationUnitDecl *TUDecl = CGM.
getContext().getTranslationUnitDecl();
// Base API name; a suffix is appended in the per-thread default stream case.
411 std::string KernelLaunchAPI =
"LaunchKernel";
413 LangOptions::GPUDefaultStreamKind::PerThread) {
415 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
417 KernelLaunchAPI = KernelLaunchAPI +
"_ptsz";
419 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
420 const IdentifierInfo &cudaLaunchKernelII =
// Pick a FunctionDecl out of the lookup results, if there is one.
422 FunctionDecl *cudaLaunchKernelFD =
nullptr;
424 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(
Result))
425 cudaLaunchKernelFD = FD;
428 if (cudaLaunchKernelFD ==
nullptr) {
430 "Can't find declaration for " + LaunchKernelName);
// The dim3 type is taken from the launch function's parameter at index 1.
434 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->
getParamDecl(1);
435 QualType Dim3Ty = GridDimParam->
getType();
// Runtime function "__" + Prefix + "PopCallConfiguration", returning IntTy.
445 llvm::FunctionType::get(IntTy,
451 addUnderscoredPrefixToName(
"PopCallConfiguration"));
// The kernel is identified by the handle recorded for this stub's name.
460 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
461 CallArgList LaunchKernelArgs;
473 QualType QT = cudaLaunchKernelFD->
getType();
478 const CGFunctionInfo &FI =
480 llvm::FunctionCallee cudaLaunchKernelFn =
// A global named "<kernel function name>.id" is looked up and, per the lines
// below, created as a non-constant i8 zero global mirroring the kernel
// function's linkage, dso_local-ness, visibility and comdat presence.
// NOTE(review): the guarding conditions are not visible -- confirm.
490 llvm::Function *KernelFunction = llvm::cast<llvm::Function>(
Kernel);
491 std::string GlobalVarName = (KernelFunction->getName() +
".id").str();
493 llvm::GlobalVariable *HandleVar =
494 CGM.
getModule().getNamedGlobal(GlobalVarName);
496 HandleVar =
new llvm::GlobalVariable(
498 false, KernelFunction->getLinkage(),
499 llvm::ConstantInt::get(CGM.
Int8Ty, 0), GlobalVarName);
500 HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
501 HandleVar->setVisibility(KernelFunction->getVisibility());
502 if (KernelFunction->hasComdat())
503 HandleVar->setComdat(CGM.
getModule().getOrInsertComdat(GlobalVarName));
// Emits the stub body for the legacy launch API. Each kernel argument is
// passed to the setup-argument runtime function with its size and an aligned
// running offset. After the loop, the launch function is obtained and the
// kernel handle recorded for this stub is cast to PtrTy.
// NOTE(review): the launch call itself is not visible in this listing.
516void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
517 FunctionArgList &Args) {
519 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
522 for (
const VarDecl *A : Args) {
// Round the running offset up to this argument's alignment.
524 Offset = Offset.
alignTo(TInfo.Align);
525 llvm::Value *Args[] = {
528 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
529 llvm::ConstantInt::get(SizeTy, Offset.
getQuantity()),
// Branch on the call result: zero goes to NextBlock, non-zero to EndBlock.
532 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
533 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB,
Zero);
535 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
// Advance past this argument.
537 Offset += TInfo.Width;
541 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
543 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
// Rewrites uses of Var so that instructions operate on a value loaded from
// ManagedVar instead. A worklist of user chains is walked, starting from each
// direct user of Var. A user that is neither an instruction nor handled by
// the chain-extension step ends in llvm_unreachable.
// NOTE(review): the first line of the signature and some conditions are not
// visible in this listing.
553 llvm::GlobalVariable *ManagedVar) {
// Seed the worklist with one single-element chain per direct user.
555 for (
auto &&VarUse : Var->uses()) {
556 WorkList.push_back({VarUse.getUser()});
558 while (!WorkList.empty()) {
559 auto &&WorkItem = WorkList.pop_back_val();
560 auto *
U = WorkItem.back();
// Extend the chain through U's own users and re-queue it.
// NOTE(review): presumably only when U is a constant expression -- confirm.
562 for (
auto &&UU :
U->uses()) {
563 WorkItem.push_back(UU.getUser());
564 WorkList.push_back(WorkItem);
// Chain ends at an instruction: insert a load named "ld.managed" from
// ManagedVar before it, using Var's pointer type and alignment.
569 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
570 llvm::Value *OldV = Var;
571 llvm::Instruction *NewV =
new llvm::LoadInst(
572 Var->getType(), ManagedVar,
"ld.managed",
false,
573 llvm::Align(Var->getAlignment()), I->getIterator());
// Chain entries reached as CE are materialized as instructions before I and
// have OldV replaced by NewV.
577 for (
auto &&Op : WorkItem) {
579 auto *NewInst = CE->getAsInstruction();
580 NewInst->insertBefore(*I->getParent(), I->getIterator());
581 NewInst->replaceUsesOfWith(OldV, NewV);
585 I->replaceUsesOfWith(OldV, NewV);
587 llvm_unreachable(
"Invalid use of managed variable");
// Builds the internal function "__" + Prefix + "_register_globals" of type
// getRegisterGlobalsFnTy(). Its first argument is bound to GpuBinaryHandlePtr.
// The body calls the runtime's RegisterFunction once per emitted kernel, then
// emits RegisterVar / RegisterManagedVar / RegisterSurface / RegisterTexture
// calls according to each DeviceVars entry's kind, and ends with `ret void`.
// Returns the created function.
606llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
// Check whether any kernel or device variable has been recorded at all.
608 if (EmittedKernels.empty() && DeviceVars.empty())
611 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
612 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
613 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
614 llvm::BasicBlock *EntryBB =
615 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
616 CGBuilderTy Builder(CGM, Context);
617 Builder.SetInsertPoint(EntryBB);
// Signature of "__" + Prefix + "RegisterFunction": ten parameters, returning
// IntTy.
621 llvm::Type *RegisterFuncParams[] = {
622 PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
623 PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
625 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
626 addUnderscoredPrefixToName(
"RegisterFunction"));
631 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
// One RegisterFunction call per emitted kernel, passing the kernel's handle,
// an all-ones IntTy value and null pointers among the arguments.
632 for (
auto &&I : EmittedKernels) {
633 llvm::Constant *KernelName =
635 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
636 llvm::Value *Args[] = {
638 KernelHandles[I.Kernel->getName()],
641 llvm::ConstantInt::getAllOnesValue(IntTy),
646 llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
647 Builder.CreateCall(RegisterFunc, Args);
// Integer type used for variable sizes; IntTy as initialized here.
// NOTE(review): any later adjustment is not visible in this listing.
650 llvm::Type *VarSizeTy = IntTy;
// Runtime entry points for variables, managed variables, surfaces, textures.
658 llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
659 IntTy, VarSizeTy, IntTy, IntTy};
661 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
662 addUnderscoredPrefixToName(
"RegisterVar"));
665 llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
666 PtrTy, VarSizeTy, IntTy};
668 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
669 addUnderscoredPrefixToName(
"RegisterManagedVar"));
673 llvm::FunctionType::get(
674 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy},
false),
675 addUnderscoredPrefixToName(
"RegisterSurface"));
679 llvm::FunctionType::get(
680 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy},
false),
681 addUnderscoredPrefixToName(
"RegisterTexture"));
682 for (
auto &&Info : DeviceVars) {
683 llvm::GlobalVariable *Var = Info.Var;
// Declarations are only expected here for managed variables.
684 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
685 "External variables should not show up here, except HIP managed "
// The variable is registered under its device-side name.
687 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
688 switch (Info.Flags.getKind()) {
689 case DeviceVarFlags::Variable: {
692 if (Info.Flags.isManaged()) {
// Managed variables must already carry the ".managed" suffix; the global
// found under the un-suffixed name is looked up as ManagedVar.
693 assert(Var->getName().ends_with(
".managed") &&
694 "HIP managed variables not transformed");
695 auto *ManagedVar = CGM.
getModule().getNamedGlobal(
696 Var->getName().drop_back(StringRef(
".managed").size()));
697 llvm::Value *Args[] = {
702 llvm::ConstantInt::get(VarSizeTy, VarSize),
703 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
// Only definitions are registered as managed variables.
704 if (!Var->isDeclaration())
705 Builder.CreateCall(RegisterManagedVar, Args);
707 llvm::Value *Args[] = {
712 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
713 llvm::ConstantInt::get(VarSizeTy, VarSize),
714 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
715 llvm::ConstantInt::get(IntTy, 0)};
716 Builder.CreateCall(RegisterVar, Args);
720 case DeviceVarFlags::Surface:
723 {&GpuBinaryHandlePtr, Var, VarName, VarName,
724 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
725 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
727 case DeviceVarFlags::Texture:
730 {&GpuBinaryHandlePtr, Var, VarName, VarName,
731 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
732 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
733 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
738 Builder.CreateRetVoid();
739 return RegisterKernelsFunc;
// Builds the internal module constructor "__" + Prefix + "_module_ctor" and
// returns it. The constructor registers the fat binary wrapper with the
// runtime, stores the returned handle in GpuBinaryHandle and calls the
// globals-registration function. With relocatable device code it instead
// calls a "__cudaRegisterLinkedBinary<ModuleID>" function. If a module
// destructor is created, it is registered through the AtExitFunc callee.
// NOTE(review): several conditions and branches are not visible in this
// listing; comments below describe only the visible statements.
761llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
766 if (CudaGpuBinaryFileName.empty() && !IsHIP)
768 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
773 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
// Relocatable device code always needs a registration function, so a dummy
// one is substituted when there was nothing to register.
776 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
777 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
// Runtime function "__" + Prefix + "RegisterFatBinary": PtrTy(PtrTy).
781 llvm::FunctionType::get(PtrTy, PtrTy,
false),
782 addUnderscoredPrefixToName(
"RegisterFatBinary"));
// Wrapper layout: two IntTy fields followed by two pointers.
784 llvm::StructType *FatbinWrapperTy =
785 llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
// Load the GPU binary through the virtual file system when a file name was
// given; a read error is reported with the file name and the error message.
791 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
792 if (!CudaGpuBinaryFileName.empty()) {
794 auto CudaGpuBinaryOrErr =
795 VFS->getBufferForFile(CudaGpuBinaryFileName, -1,
false);
796 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
798 << CudaGpuBinaryFileName << EC.message();
801 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
804 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
805 llvm::FunctionType::get(VoidTy,
false),
806 llvm::GlobalValue::InternalLinkage,
807 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
808 llvm::BasicBlock *CtorEntryBB =
809 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
810 CGBuilderTy CtorBuilder(CGM, Context);
812 CtorBuilder.SetInsertPoint(CtorEntryBB);
// Section and symbol naming differs between the HIP and CUDA flavours and
// between macOS and other targets; the values are chosen below.
814 const char *FatbinConstantName;
815 const char *FatbinSectionName;
816 const char *ModuleIDSectionName;
817 StringRef ModuleIDPrefix;
818 llvm::Constant *FatBinStr;
// HIP naming.
823 CGM.
getTriple().isMacOSX() ?
"__HIP,__hip_fatbin" :
".hip_fatbin";
825 CGM.
getTriple().isMacOSX() ?
"__HIP,__fatbin" :
".hipFatBinSegment";
827 ModuleIDSectionName =
828 CGM.
getTriple().isMacOSX() ?
"__HIP,__module_id" :
"__hip_module_id";
829 ModuleIDPrefix =
"__hip_";
// The embedded HIP binary is emitted with 4096-byte alignment.
834 const unsigned HIPCodeObjectAlign = 4096;
835 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
836 FatbinConstantName, HIPCodeObjectAlign);
// Alternative: FatBinStr is an external constant global without initializer.
// NOTE(review): presumably used when no binary is embedded -- confirm.
842 FatBinStr =
new llvm::GlobalVariable(
844 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
848 nullptr, llvm::GlobalVariable::NotThreadLocal);
852 FatMagic = HIPFatMagic;
// CUDA naming.
854 if (RelocatableDeviceCode)
855 FatbinConstantName = CGM.
getTriple().isMacOSX()
856 ?
"__NV_CUDA,__nv_relfatbin"
860 CGM.
getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
863 CGM.
getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
865 ModuleIDSectionName = CGM.
getTriple().isMacOSX()
866 ?
"__NV_CUDA,__nv_module_id"
868 ModuleIDPrefix =
"__nv_";
// The embedded CUDA binary is emitted with 8-byte alignment.
872 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
873 FatbinConstantName, 8);
874 FatMagic = CudaFatMagic;
// Fill the wrapper: magic number, the constant 1, pointer to the binary data,
// and a null pointer; the wrapper global is placed in FatbinSectionName.
878 ConstantInitBuilder Builder(CGM);
879 auto Values = Builder.beginStruct(FatbinWrapperTy);
881 Values.addInt(IntTy, FatMagic);
883 Values.addInt(IntTy, 1);
885 Values.add(FatBinStr);
887 Values.add(llvm::ConstantPointerNull::get(PtrTy));
888 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
891 FatbinWrapper->setSection(FatbinSectionName);
// Handle variable: external linkage with relocatable device code, internal
// otherwise; a non-internal handle is given hidden visibility.
902 auto Linkage = RelocatableDeviceCode ? llvm::GlobalValue::ExternalLinkage
903 : llvm::GlobalValue::InternalLinkage;
904 llvm::BasicBlock *IfBlock =
905 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
906 llvm::BasicBlock *ExitBlock =
907 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
910 GpuBinaryHandle =
new llvm::GlobalVariable(
911 TheModule, PtrTy,
false,
Linkage,
913 !RelocatableDeviceCode ? llvm::ConstantPointerNull::get(PtrTy)
920 if (
Linkage != llvm::GlobalValue::InternalLinkage)
921 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
923 GpuBinaryHandle, PtrTy,
// Register the fat binary only when the stored handle is still null.
926 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
927 llvm::Constant *
Zero =
928 llvm::Constant::getNullValue(HandleValue->getType());
929 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue,
Zero);
930 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
// "if" block: register and remember the returned handle.
933 CtorBuilder.SetInsertPoint(IfBlock);
935 llvm::CallInst *RegisterFatbinCall =
936 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
937 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
938 CtorBuilder.CreateBr(ExitBlock);
// "exit" block: pass the (re)loaded handle to the registration function.
941 CtorBuilder.SetInsertPoint(ExitBlock);
943 if (RegisterGlobalsFunc) {
944 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
945 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
948 }
else if (!RelocatableDeviceCode) {
// Non-relocatable path: register unconditionally and keep the result in an
// internal global named "__cuda_gpubin_handle".
952 llvm::CallInst *RegisterFatbinCall =
953 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
954 GpuBinaryHandle =
new llvm::GlobalVariable(
955 TheModule, PtrTy,
false, llvm::GlobalValue::InternalLinkage,
956 llvm::ConstantPointerNull::get(PtrTy),
"__cuda_gpubin_handle");
958 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
962 if (RegisterGlobalsFunc)
963 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
// When the CUDA_USES_FATBIN_REGISTER_END feature applies, the runtime is also
// told that registration is complete via __cudaRegisterFatBinaryEnd.
967 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
970 llvm::FunctionType::get(VoidTy, PtrTy,
false),
971 "__cudaRegisterFatBinaryEnd");
972 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
// Module ID: the prefix followed by the wrapper's GUID printed in hex. It is
// stored as a null-terminated array in ModuleIDSectionName with alignment 32.
976 SmallString<64> ModuleID;
977 llvm::raw_svector_ostream
OS(ModuleID);
978 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
979 llvm::Constant *ModuleIDConstant = makeConstantArray(
980 std::string(ModuleID),
"", ModuleIDSectionName, 32,
true);
// Externally visible alias "__fatbinwrap<ModuleID>" for the wrapper.
983 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
984 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
// Call "__cudaRegisterLinkedBinary<ModuleID>" with the registration function,
// the wrapper, the module ID and a dummy callback.
988 SmallString<128> RegisterLinkedBinaryName(
"__cudaRegisterLinkedBinary");
989 RegisterLinkedBinaryName += ModuleID;
991 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
993 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
994 llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
995 makeDummyFunction(getCallbackFnTy())};
996 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
// If a module destructor was built, pass it to AtExitFunc.
1002 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
1004 llvm::FunctionType *AtExitTy =
1005 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
1006 llvm::FunctionCallee AtExitFunc =
1009 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
1012 CtorBuilder.CreateRetVoid();
1013 return ModuleCtorFunc;
// Builds the internal module destructor "__" + Prefix + "_module_dtor" and
// returns it. The destructor passes the value loaded from GpuBinaryHandle to
// "__" + Prefix + "UnregisterFatBinary".
// Two shapes are visible below: a null-checked unregister that also resets
// the stored handle to null, and a direct unregister call.
// NOTE(review): the condition that selects between them is not visible in
// this listing.
1035llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// Check for the case where no fat binary handle was ever created.
1037 if (!GpuBinaryHandle)
1042 llvm::FunctionType::get(VoidTy, PtrTy,
false),
1043 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
1045 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
1046 llvm::FunctionType::get(VoidTy,
false),
1047 llvm::GlobalValue::InternalLinkage,
1048 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
1050 llvm::BasicBlock *DtorEntryBB =
1051 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
1052 CGBuilderTy DtorBuilder(CGM, Context);
1053 DtorBuilder.SetInsertPoint(DtorEntryBB);
1056 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
1058 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
// Null-checked variant: unregister only a non-null handle, then clear it.
1063 llvm::BasicBlock *IfBlock =
1064 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
1065 llvm::BasicBlock *ExitBlock =
1066 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
1067 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
1068 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue,
Zero);
1069 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
1071 DtorBuilder.SetInsertPoint(IfBlock);
1072 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1073 DtorBuilder.CreateStore(
Zero, GpuBinaryAddr);
1074 DtorBuilder.CreateBr(ExitBlock);
1076 DtorBuilder.SetInsertPoint(ExitBlock);
// Direct variant: unregister without a null check.
1078 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1080 DtorBuilder.CreateRetVoid();
1081 return ModuleDtorFunc;
1085 return new CGNVCUDARuntime(CGM);
// May change Linkage (an in/out parameter) to InternalLinkage for D.
// The visible condition tests D for the CUDA device, constant and shared
// attributes.
// NOTE(review): the rest of the condition is not visible in this listing.
1088void CGNVCUDARuntime::internalizeDeviceSideVar(
1105 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>() ||
1106 D->
hasAttr<CUDASharedAttr>() ||
1109 Linkage = llvm::GlobalValue::InternalLinkage;
// Records GV for later runtime registration, depending on how D is declared.
// Variables with the CUDA device or constant attribute are one case.
// Variables of a builtin surface or texture template type are registered via
// registerDeviceSurf / registerDeviceTex, with values read from the template
// arguments.
// NOTE(review): the call for plain device variables and the surrounding
// conditions are only partially visible in this listing.
1113void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *D,
1114 llvm::GlobalVariable &GV) {
1115 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>()) {
1131 D->
hasAttr<HIPManagedAttr>()) {
1133 D->
hasAttr<CUDAConstantAttr>());
// Surface/texture types: inspect the template arguments of TD.
1141 const TemplateArgumentList &Args = TD->getTemplateArgs();
1142 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1143 assert(Args.
size() == 2 &&
1144 "Unexpected number of template arguments of CUDA device "
1145 "builtin surface type.");
// Surface: argument 1 is the surface type; extern means D has no definition.
1146 auto SurfType = Args[1].getAsIntegral();
1148 registerDeviceSurf(D, GV, !D->
hasDefinition(), SurfType.getSExtValue());
1150 assert(Args.
size() == 3 &&
1151 "Unexpected number of template arguments of CUDA device "
1152 "builtin texture type.");
// Texture: argument 1 is the texture type, argument 2 the normalized flag.
1153 auto TexType = Args[1].getAsIntegral();
1154 auto Normalized = Args[2].getAsIntegral();
1156 registerDeviceTex(D, GV, !D->
hasDefinition(), TexType.getSExtValue(),
1157 Normalized.getZExtValue());
// For every managed entry of kind Variable in DeviceVars, creates a companion
// global (ManagedVar) that takes over the original variable's name. The
// original variable is renamed to "<name>.managed". The companion copies the
// variable's linkage, dso_local-ness and visibility, and is marked
// externally initialized.
// NOTE(review): the type argument and the use-rewriting step are not visible
// in this listing.
1166void CGNVCUDARuntime::transformManagedVars() {
1167 for (
auto &&Info : DeviceVars) {
1168 llvm::GlobalVariable *Var = Info.Var;
1169 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1170 Info.Flags.isManaged()) {
// The initializer depends on whether Var is only a declaration; one
// alternative is a null pointer of Var's pointer type. The address space is
// either cuda_device or the default one.
1171 auto *ManagedVar =
new llvm::GlobalVariable(
1173 false, Var->getLinkage(),
1174 Var->isDeclaration()
1176 : llvm::ConstantPointerNull::get(Var->getType()),
1178 llvm::GlobalVariable::NotThreadLocal,
1180 ? LangAS::cuda_device
1181 : LangAS::Default));
1182 ManagedVar->setDSOLocal(Var->isDSOLocal());
1183 ManagedVar->setVisibility(Var->getVisibility());
1184 ManagedVar->setExternallyInitialized(
true);
// Swap names: the companion gets the original name, Var gets the suffix.
1186 ManagedVar->takeName(Var);
1187 Var->setName(Twine(ManagedVar->getName()) +
".managed");
// Device-side compilation of a defined variable expects a defined companion.
1190 if (CGM.
getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1191 assert(!ManagedVar->isDeclaration());
// Emits one offloading entry per emitted kernel and per DeviceVars entry via
// llvm::offloading::emitOffloadingEntry. The offload kind is HIP or CUDA and
// may be overridden with OpenMP.
// NOTE(review): the conditions choosing the kind are not visible in this
// listing.
1202void CGNVCUDARuntime::createOffloadingEntries() {
1204 ? llvm::object::OffloadKind::OFK_HIP
1205 : llvm::object::OffloadKind::OFK_Cuda;
1208 Kind = llvm::object::OffloadKind::OFK_OpenMP;
// Kernels are emitted as plain global entries keyed by their handle.
1211 for (KernelInfo &I : EmittedKernels)
1212 llvm::offloading::emitOffloadingEntry(
1213 M, Kind, KernelHandles[I.Kernel->getName()],
1215 llvm::offloading::OffloadGlobalEntry);
1217 for (VarInfo &I : DeviceVars) {
// Size comes from the data layout; Flags combines the extern, constant and
// normalized bits of the entry.
1219 CGM.
getDataLayout().getTypeAllocSize(I.Var->getValueType());
1222 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
1224 (I.Flags.isConstant()
1225 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
1227 (I.Flags.isNormalized()
1228 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
1230 if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1231 if (I.Flags.isManaged()) {
// Managed variables must already be renamed with the ".managed" suffix. The
// global under the un-suffixed name is passed along with alignment.
1232 assert(I.Var->getName().ends_with(
".managed") &&
1233 "HIP managed variables not transformed");
1235 auto *ManagedVar = M.getNamedGlobal(
1236 I.Var->getName().drop_back(StringRef(
".managed").size()));
1237 llvm::offloading::emitOffloadingEntry(
1238 M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
1239 llvm::offloading::OffloadGlobalManagedEntry | Flags,
1240 I.Var->getAlignment(), ManagedVar);
1242 llvm::offloading::emitOffloadingEntry(
1243 M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
1244 llvm::offloading::OffloadGlobalEntry | Flags,
1247 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
// Surfaces and textures additionally pass their surface/texture type.
1248 llvm::offloading::emitOffloadingEntry(
1249 M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
1250 llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1251 I.Flags.getSurfTexType());
1252 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1253 llvm::offloading::emitOffloadingEntry(
1254 M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
1255 llvm::offloading::OffloadGlobalTextureEntry | Flags,
1256 I.Flags.getSurfTexType());
// Final per-module step. Managed variables are transformed first. Device
// variables matching the condition below are then processed, and offloading
// entries may be created. The function returns the module constructor from
// makeModuleCtorFunction().
// NOTE(review): the action taken on matching variables, the conditions
// around createOffloadingEntries() and any other return paths are not
// visible in this listing.
1262llvm::Function *CGNVCUDARuntime::finalizeModule() {
1263 transformManagedVars();
// Selects defined, non-local variables/surfaces/textures whose declaration is
// marked used but does not carry UsedAttr.
1275 for (
auto &&Info : DeviceVars) {
1276 auto Kind = Info.Flags.getKind();
1277 if (!Info.Var->isDeclaration() &&
1278 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1279 (Kind == DeviceVarFlags::Variable ||
1280 Kind == DeviceVarFlags::Surface ||
1281 Kind == DeviceVarFlags::Texture) &&
1282 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1289 (CGM.
getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
1290 createOffloadingEntries();
1292 return makeModuleCtorFunction();
// Returns the handle associated with kernel stub F, maintaining two maps:
// KernelHandles (stub name -> handle) and KernelStubs (handle -> stub).
// In one visible path the function itself is stored as its own handle. In
// another, a new constant global of F's pointer type is created and stored
// as the handle.
// NOTE(review): the conditions choosing between these paths and the return
// statements are not visible in this listing.
1297llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
// An existing entry for F's name is reused; its stub mapping is refreshed
// when it does not already point at F.
1299 auto Loc = KernelHandles.find(F->getName());
1300 if (Loc != KernelHandles.end()) {
1301 auto OldHandle = Loc->second;
1302 if (KernelStubs[OldHandle] == F)
1310 KernelStubs[OldHandle] = F;
1315 KernelStubs.erase(OldHandle);
1319 KernelHandles[F->getName()] = F;
// New handle variable mirroring F's linkage, dso_local-ness and visibility.
1324 auto *Var =
new llvm::GlobalVariable(
1325 TheModule, F->getType(),
true, F->getLinkage(),
1330 Var->setDSOLocal(F->isDSOLocal());
1331 Var->setVisibility(F->getVisibility());
1333 auto *FT = FD->getPrimaryTemplate();
1334 if (!FT || FT->isThisDeclarationADefinition())
1336 KernelHandles[F->getName()] = Var;
1337 KernelStubs[Var] = F;
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
Result
Implement __builtin_bit_cast and related operations.
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
bool shouldExternalize(const Decl *D) const
Whether a C++ static variable or CUDA/HIP kernel should be externalized.
StringRef getCUIDHash() const
llvm::SetVector< const VarDecl * > CUDADeviceVarODRUsedByHost
Keep track of CUDA/HIP device-side variables ODR-used by host code.
const TargetInfo * getAuxTargetInfo() const
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by settin...
TypeInfoChars getTypeInfoInChars(const Type *T) const
const TargetInfo & getTargetInfo() const
unsigned getTargetAddressSpace(LangAS AS) const
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
static CharUnits One()
One - Construct a CharUnits quantity of one.
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
CharUnits alignTo(const CharUnits &Align) const
alignTo - Returns the next integer (mod 2**64) that is greater than or equal to this quantity and is ...
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
std::string CudaGpuBinaryFileName
Name of file passed with -fcuda-include-gpubinary option to forward to CUDA runtime back-end for inco...
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
llvm::StoreInst * CreateAlignedStore(llvm::Value *Val, llvm::Value *Addr, CharUnits Align, bool IsVolatile=false)
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
Address CreateStructGEP(Address Addr, unsigned Index, const llvm::Twine &Name="")
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
MangleContext & getMangleContext()
Gets the mangle context.
static CGCallee forDirect(llvm::Constant *functionPtr, const CGCalleeInfo &abstractInfo=CGCalleeInfo())
void add(RValue rvalue, QualType type)
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Emits a call or invoke instruction to the given runtime function.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
const LangOptions & getLangOpts() const
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr...
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **CallOrInvoke, bool IsMustTail, SourceLocation Loc, bool IsVirtualFunctionPointerThunk=false)
EmitCall - Generate a call of the given function, expecting the given result type,...
RawAddress CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits align, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates a alloca and inserts it into the entry block.
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
llvm::Type * ConvertTypeForMem(QualType T)
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
llvm::FunctionCallee CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false, bool AssumeConvergent=false)
Create or return a runtime function declaration with the specified type and name.
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
const IntrusiveRefCntPtr< llvm::vfs::FileSystem > & getFileSystem() const
DiagnosticsEngine & getDiags() const
const LangOptions & getLangOpts() const
CodeGenTypes & getTypes()
const TargetInfo & getTarget() const
const llvm::DataLayout & getDataLayout() const
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
CGCXXABI & getCXXABI() const
SanitizerMetadata * getSanitizerMetadata()
const llvm::Triple & getTriple() const
ASTContext & getContext() const
const CodeGenOptions & getCodeGenOpts() const
StringRef getMangledName(GlobalDecl GD)
void maybeSetTrivialComdat(const Decl &D, llvm::GlobalObject &GO)
void printPostfixForExternalizedDecl(llvm::raw_ostream &OS, const Decl *D) const
Print the postfix for externalized static variable or kernels for single source offloading languages ...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
const CGFunctionInfo & arrangeFunctionDeclaration(const GlobalDecl GD)
Free functions are functions that are compatible with an ordinary C function pointer type.
static RValue get(llvm::Value *V)
static RValue getAggregate(Address addr, bool isVolatile=false)
Convert an Address to an RValue.
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
SourceLocation getLocation() const
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
StringRef getName() const
Return the actual identifier string.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
std::string CUID
The user provided compilation unit ID, if non-empty.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
QualType getCanonicalType() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
const llvm::VersionTuple & getSDKVersion() const
unsigned size() const
Retrieve the number of template arguments in this template argument list.
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
CXXRecordDecl * castAsCXXRecordDecl() const
bool isCUDADeviceBuiltinSurfaceType() const
Check if the type is the CUDA device builtin surface type.
bool isCUDADeviceBuiltinTextureType() const
Check if the type is the CUDA device builtin texture type.
Represents a variable declaration or definition.
bool isInline() const
Whether this variable is (C++1z) inline.
bool hasExternalStorage() const
Returns true if a variable has extern or private_extern storage.
DefinitionKind hasDefinition(ASTContext &) const
Check whether this variable is defined in this translation unit.
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
@ VFS
Remove unused -ivfsoverlay arguments.
@ OS
Indicates that the tracking object is a descendant of a reference-counted OSObject,...
@ Address
A pointer to a ValueDecl.
The JSON file list parser is used to communicate input to InstallAPI.
bool isa(CodeGen::Address addr)
if(T->getSizeExpr()) TRY_TO(TraverseStmt(const_cast< Expr * >(T -> getSizeExpr())))
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
@ Type
The name was classified as a type.
U cast(CodeGen::Address addr)
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
CharUnits getSizeAlign() const
CharUnits getPointerAlign() const
llvm::PointerType * DefaultPtrTy