10#include "TargetInfo.h"
24 static const unsigned MaxNumRegsForArgsRet = 16;
26 unsigned numRegsForType(
QualType Ty)
const;
30 uint64_t Members)
const override;
33 llvm::Type *coerceKernelArgumentType(llvm::Type *Ty,
unsigned FromAS,
34 unsigned ToAS)
const {
36 auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
37 if (PtrTy && PtrTy->getAddressSpace() == FromAS)
38 return llvm::PointerType::get(Ty->getContext(), ToAS);
55bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(
QualType Ty)
const {
59bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
60 const Type *
Base, uint64_t Members)
const {
61 uint32_t NumRegs = (getContext().getTypeSize(
Base) + 31) / 32;
64 return Members * NumRegs <= MaxNumRegsForArgsRet;
68unsigned AMDGPUABIInfo::numRegsForType(
QualType Ty)
const {
74 QualType EltTy = VT->getElementType();
75 unsigned EltSize = getContext().getTypeSize(EltTy);
79 return (VT->getNumElements() + 1) / 2;
81 unsigned EltNumRegs = (EltSize + 31) / 32;
82 return EltNumRegs * VT->getNumElements();
91 NumRegs += numRegsForType(FieldTy);
97 return (getContext().getTypeSize(Ty) + 31) / 32;
106 unsigned NumRegsLeft = MaxNumRegsForArgsRet;
108 if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
109 Arg.info = classifyKernelArgumentType(Arg.type);
118 llvm_unreachable(
"AMDGPU does not support varargs");
149 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
153 if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
172 llvm::Type *OrigLTy = CGT.ConvertType(Ty);
173 llvm::Type *LTy = OrigLTy;
174 if (getContext().getLangOpts().
HIP) {
175 LTy = coerceKernelArgumentType(
176 OrigLTy, getContext().getTargetAddressSpace(LangAS::Default),
177 getContext().getTargetAddressSpace(LangAS::cuda_device));
186 if (!getContext().getLangOpts().
OpenCL && LTy == OrigLTy &&
189 getContext().getTypeAlignInChars(Ty),
190 getContext().getTargetAddressSpace(LangAS::opencl_constant),
201 unsigned &NumRegsLeft)
const {
202 assert(NumRegsLeft <= MaxNumRegsForArgsRet &&
"register estimate underflow");
231 unsigned NumRegs = (
Size + 31) / 32;
232 NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
241 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
245 if (NumRegsLeft > 0) {
246 unsigned NumRegs = numRegsForType(Ty);
247 if (NumRegsLeft >= NumRegs) {
248 NumRegsLeft -= NumRegs;
256 getContext().getTypeAlignInChars(Ty),
257 getContext().getTargetAddressSpace(LangAS::opencl_private));
263 unsigned NumRegs = numRegsForType(Ty);
264 NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
275 void setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F,
285 llvm::PointerType *T,
QualType QT)
const override;
289 getABIInfo().getDataLayout().getAllocaAddrSpace());
292 const VarDecl *D)
const override;
295 llvm::AtomicOrdering Ordering,
296 llvm::LLVMContext &Ctx)
const override;
298 llvm::Function *BlockInvokeFunc,
299 llvm::Type *BlockTy)
const override;
307 llvm::GlobalValue *GV) {
308 if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
311 return !D->
hasAttr<OMPDeclareTargetDeclAttr>() &&
312 (D->
hasAttr<OpenCLKernelAttr>() ||
313 (isa<FunctionDecl>(D) && D->
hasAttr<CUDAGlobalAttr>()) ||
315 (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>() ||
316 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
317 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
320void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
322 const auto *ReqdWGS =
324 const bool IsOpenCLKernel =
328 const auto *FlatWGS = FD->
getAttr<AMDGPUFlatWorkGroupSizeAttr>();
329 if (ReqdWGS || FlatWGS) {
331 }
else if (IsOpenCLKernel || IsHIPKernel) {
334 const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
335 const unsigned DefaultMaxWorkGroupSize =
336 IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
338 std::string AttrVal =
339 std::string(
"1,") + llvm::utostr(DefaultMaxWorkGroupSize);
340 F->addFnAttr(
"amdgpu-flat-work-group-size", AttrVal);
343 if (
const auto *
Attr = FD->
getAttr<AMDGPUWavesPerEUAttr>())
346 if (
const auto *
Attr = FD->
getAttr<AMDGPUNumSGPRAttr>()) {
347 unsigned NumSGPR =
Attr->getNumSGPR();
350 F->addFnAttr(
"amdgpu-num-sgpr", llvm::utostr(NumSGPR));
353 if (
const auto *
Attr = FD->
getAttr<AMDGPUNumVGPRAttr>()) {
354 uint32_t NumVGPR =
Attr->getNumVGPR();
357 F->addFnAttr(
"amdgpu-num-vgpr", llvm::utostr(NumVGPR));
363void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
365 StringRef Name =
"__oclc_ABI_version";
366 llvm::GlobalVariable *OriginalGV = CGM.
getModule().getNamedGlobal(Name);
367 if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
371 llvm::CodeObjectVersionKind::COV_None)
374 auto *
Type = llvm::IntegerType::getIntNTy(CGM.
getModule().getContext(), 32);
375 llvm::Constant *COV = llvm::ConstantInt::get(
380 auto *GV =
new llvm::GlobalVariable(
381 CGM.
getModule(),
Type,
true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
382 nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
384 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
385 GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
389 OriginalGV->replaceAllUsesWith(GV);
390 GV->takeName(OriginalGV);
391 OriginalGV->eraseFromParent();
395void AMDGPUTargetCodeGenInfo::setTargetAttributes(
398 GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
399 GV->setDSOLocal(
true);
402 if (GV->isDeclaration())
405 llvm::Function *F = dyn_cast<llvm::Function>(GV);
409 const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
411 setFunctionDeclAttributes(FD, F, M);
414 F->addFnAttr(
"amdgpu-unsafe-fp-atomics",
"true");
416 if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
417 F->addFnAttr(
"amdgpu-ieee",
"false");
420unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv()
const {
421 return llvm::CallingConv::AMDGPU_KERNEL;
429llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
433 return llvm::ConstantPointerNull::get(PT);
436 auto NPT = llvm::PointerType::get(
437 PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
438 return llvm::ConstantExpr::getAddrSpaceCast(
439 llvm::ConstantPointerNull::get(NPT), PT);
443AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(
CodeGenModule &CGM,
447 "Address space agnostic languages only");
451 return DefaultGlobalAS;
454 if (AddrSpace != LangAS::Default)
463 return DefaultGlobalAS;
467AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(
const LangOptions &LangOpts,
469 llvm::AtomicOrdering Ordering,
470 llvm::LLVMContext &Ctx)
const {
473 case SyncScope::HIPSingleThread:
474 Name =
"singlethread";
476 case SyncScope::HIPWavefront:
477 case SyncScope::OpenCLSubGroup:
480 case SyncScope::HIPWorkgroup:
481 case SyncScope::OpenCLWorkGroup:
484 case SyncScope::HIPAgent:
485 case SyncScope::OpenCLDevice:
488 case SyncScope::HIPSystem:
489 case SyncScope::OpenCLAllSVMDevices:
494 if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
496 Name = Twine(Twine(Name) + Twine(
"-")).str();
498 Name = Twine(Twine(Name) + Twine(
"one-as")).str();
501 return Ctx.getOrInsertSyncScopeID(Name);
504bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases()
const {
508bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators()
const {
512void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
514 FT = getABIInfo().getContext().adjustFunctionType(
526llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
527 CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy)
const {
531 auto *InvokeFT = Invoke->getFunctionType();
540 ArgTys.push_back(BlockTy);
541 ArgTypeNames.push_back(llvm::MDString::get(
C,
"__block_literal"));
542 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
543 ArgBaseTypeNames.push_back(llvm::MDString::get(
C,
"__block_literal"));
544 ArgTypeQuals.push_back(llvm::MDString::get(
C,
""));
545 AccessQuals.push_back(llvm::MDString::get(
C,
"none"));
546 ArgNames.push_back(llvm::MDString::get(
C,
"block_literal"));
547 for (
unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
548 ArgTys.push_back(InvokeFT->getParamType(I));
549 ArgTypeNames.push_back(llvm::MDString::get(
C,
"void*"));
550 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
551 AccessQuals.push_back(llvm::MDString::get(
C,
"none"));
552 ArgBaseTypeNames.push_back(llvm::MDString::get(
C,
"void*"));
553 ArgTypeQuals.push_back(llvm::MDString::get(
C,
""));
555 llvm::MDString::get(
C, (Twine(
"local_arg") + Twine(I)).str()));
557 std::string Name = Invoke->getName().str() +
"_kernel";
558 auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(
C), ArgTys,
false);
559 auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
561 F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
563 llvm::AttrBuilder KernelAttrs(
C);
567 KernelAttrs.addAttribute(
"enqueued-block");
568 F->addFnAttrs(KernelAttrs);
570 auto IP = CGF.
Builder.saveIP();
571 auto *BB = llvm::BasicBlock::Create(
C,
"entry", F);
572 Builder.SetInsertPoint(BB);
574 auto *BlockPtr = Builder.CreateAlloca(BlockTy,
nullptr);
575 BlockPtr->setAlignment(BlockAlign);
576 Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
577 auto *
Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
579 Args.push_back(Cast);
580 for (llvm::Argument &A : llvm::drop_begin(F->args()))
582 llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
583 call->setCallingConv(Invoke->getCallingConv());
584 Builder.CreateRetVoid();
585 Builder.restoreIP(IP);
587 F->setMetadata(
"kernel_arg_addr_space", llvm::MDNode::get(
C, AddressQuals));
588 F->setMetadata(
"kernel_arg_access_qual", llvm::MDNode::get(
C, AccessQuals));
589 F->setMetadata(
"kernel_arg_type", llvm::MDNode::get(
C, ArgTypeNames));
590 F->setMetadata(
"kernel_arg_base_type",
591 llvm::MDNode::get(
C, ArgBaseTypeNames));
592 F->setMetadata(
"kernel_arg_type_qual", llvm::MDNode::get(
C, ArgTypeQuals));
594 F->setMetadata(
"kernel_arg_name", llvm::MDNode::get(
C, ArgNames));
600 llvm::Function *F,
const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
601 const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
602 int32_t *MaxThreadsVal) {
606 Min = FlatWGS->getMin()->EvaluateKnownConstInt(
getContext()).getExtValue();
607 Max = FlatWGS->getMax()->EvaluateKnownConstInt(
getContext()).getExtValue();
609 if (ReqdWGS && Min == 0 && Max == 0)
610 Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
613 assert(Min <= Max &&
"Min must be less than or equal Max");
616 *MinThreadsVal = Min;
618 *MaxThreadsVal = Max;
619 std::string AttrVal = llvm::utostr(Min) +
"," + llvm::utostr(Max);
621 F->addFnAttr(
"amdgpu-flat-work-group-size", AttrVal);
623 assert(Max == 0 &&
"Max must be zero");
627 llvm::Function *F,
const AMDGPUWavesPerEUAttr *
Attr) {
629 Attr->getMin()->EvaluateKnownConstInt(
getContext()).getExtValue();
632 ?
Attr->getMax()->EvaluateKnownConstInt(
getContext()).getExtValue()
636 assert((Max == 0 || Min <= Max) &&
"Min must be less than or equal Max");
638 std::string AttrVal = llvm::utostr(Min);
640 AttrVal = AttrVal +
"," + llvm::utostr(Max);
641 F->addFnAttr(
"amdgpu-waves-per-eu", AttrVal);
643 assert(Max == 0 &&
"Max must be zero");
646std::unique_ptr<TargetCodeGenInfo>
648 return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.
getTypes());
static bool requiresAMDGPUProtectedVisibility(const Decl *D, llvm::GlobalValue *GV)
Defines the clang::TargetOptions class.
uint64_t getTargetNullPointerValue(QualType QT) const
Get target-dependent integer value for null pointer which is used for constant folding.
const TargetInfo & getTargetInfo() const
unsigned getTargetAddressSpace(LangAS AS) const
Attr - This represents one attribute.
ABIArgInfo - Helper class to encapsulate information about how a specific C type should be passed to ...
static ABIArgInfo getIgnore()
static ABIArgInfo getDirect(llvm::Type *T=nullptr, unsigned Offset=0, llvm::Type *Padding=nullptr, bool CanBeFlattened=true, unsigned Align=0)
static ABIArgInfo getIndirectAliased(CharUnits Alignment, unsigned AddrSpace, bool Realign=false, llvm::Type *Padding=nullptr)
Pass this in memory using the IR byref attribute.
virtual bool isHomogeneousAggregateBaseType(QualType Ty) const
virtual bool isHomogeneousAggregateSmallEnough(const Type *Base, uint64_t Members) const
@ RAA_DirectInMemory
Pass it on the stack using its defined layout.
CGFunctionInfo - Class to encapsulate the information about a function definition.
ABIArgInfo & getReturnInfo()
unsigned getCallingConvention() const
getCallingConvention - Return the user specified calling convention, which has been translated into a...
CanQualType getReturnType() const
MutableArrayRef< ArgInfo > arguments()
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::LLVMContext & getLLVMContext()
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
void handleAMDGPUWavesPerEUAttr(llvm::Function *F, const AMDGPUWavesPerEUAttr *A)
Emit the IR encoding to attach the AMD GPU waves-per-eu attribute to F.
const LangOptions & getLangOpts() const
CodeGenTypes & getTypes()
const TargetInfo & getTarget() const
void handleAMDGPUFlatWorkGroupSizeAttr(llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *A, const ReqdWorkGroupSizeAttr *ReqdWGS=nullptr, int32_t *MinThreadsVal=nullptr, int32_t *MaxThreadsVal=nullptr)
Emit the IR encoding to attach the AMD GPU flat-work-group-size attribute to F.
const llvm::DataLayout & getDataLayout() const
ASTContext & getContext() const
const CodeGenOptions & getCodeGenOpts() const
void addDefaultFunctionDefinitionAttributes(llvm::AttrBuilder &attrs)
Like the overload taking a Function &, but intended specifically for frontends that want to build on ...
This class organizes the cross-module state that is used while lowering AST types to LLVM types.
DefaultABIInfo - The default implementation for ABI specific details.
ABIArgInfo classifyArgumentType(QualType RetTy) const
ABIArgInfo classifyReturnType(QualType RetTy) const
void computeInfo(CGFunctionInfo &FI) const override
Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty) const override
EmitVAArg - Emit the target dependent code to load a value of Ty from the va_list pointed to by VAListAddr.
TargetCodeGenInfo - This class organizes various target-specific codegeneration issues,...
virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const
virtual llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts, SyncScope Scope, llvm::AtomicOrdering Ordering, llvm::LLVMContext &Ctx) const
Get the syncscope used in LLVM IR.
const T & getABIInfo() const
virtual unsigned getOpenCLKernelCallingConv() const
Get LLVM calling convention for OpenCL kernel.
virtual LangAS getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const
Get target favored AST address space of a global variable for languages other than OpenCL and CUDA.
virtual void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const
setTargetAttributes - Provides a convenient hook to handle extra target-specific attributes for the g...
virtual bool shouldEmitDWARFBitFieldSeparators() const
virtual llvm::Constant * getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const
Get target specific null pointer.
virtual LangAS getASTAllocaAddressSpace() const
Get the AST address space for alloca.
virtual llvm::Value * createEnqueuedBlockKernel(CodeGenFunction &CGF, llvm::Function *BlockInvokeFunc, llvm::Type *BlockTy) const
Create an OpenCL kernel for an enqueued block.
virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const
Provides a convenient hook to handle extra target-specific globals.
virtual bool shouldEmitStaticExternCAliases() const
Decl - This represents one declaration (or definition), e.g. a variable, typedef, function, struct, etc.
Represents a member of a struct/union/class.
Represents a function declaration or definition.
ExtInfo withCallingConv(CallingConv cc) const
FunctionType - C99 6.7.5.3 - Function Declarators.
ExtInfo getExtInfo() const
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
A (possibly-)qualified type.
LangAS getAddressSpace() const
Return the address space of this type.
bool isConstantStorage(const ASTContext &Ctx, bool ExcludeCtor, bool ExcludeDtor)
Represents a struct/union/class.
bool hasFlexibleArrayMember() const
field_range fields() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
Scope - A scope is a transient data structure that is used while parsing the program.
TargetOptions & getTargetOpts() const
Retrieve the target options.
virtual std::optional< LangAS > getConstantAddressSpace() const
Return an AST address space which can be used opportunistically for constant global memory.
bool allowAMDGPUUnsafeFPAtomics() const
Returns whether or not the AMDGPU unsafe floating point atomics are allowed.
llvm::CodeObjectVersionKind CodeObjectVersion
Code object version for AMDGPU.
The base class of the type hierarchy.
const T * getAs() const
Member-template getAs<specific type>.
Represents a variable declaration or definition.
bool hasConstantInitialization() const
Determine whether this variable has constant initialization.
Represents a GCC generic vector type.
ABIArgInfo classifyArgumentType(CodeGenModule &CGM, CanQualType type)
Classify the rules for how to pass a particular type.
CGCXXABI::RecordArgABI getRecordArgABI(const RecordType *RT, CGCXXABI &CXXABI)
bool classifyReturnType(const CGCXXABI &CXXABI, CGFunctionInfo &FI, const ABIInfo &Info)
std::unique_ptr< TargetCodeGenInfo > createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM)
bool isAggregateTypeForABI(QualType T)
const Type * isSingleElementStruct(QualType T, ASTContext &Context)
isSingleElementStruct - Determine if a structure is a "single element struct", i.e. it has exactly one non-empty field or exactly one field which is itself a single element struct.
QualType useFirstFieldIfTransparentUnion(QualType Ty)
Pass transparent unions as if they were the type of the first element.
bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, bool AsIfNoUniqueAddr=false)
isEmptyRecord - Return true iff a structure contains only empty fields.
bool Cast(InterpState &S, CodePtr OpPC)
LangAS
Defines the address space values used by the address space qualifier of QualType.
SyncScope
Defines synchronization scope values used internally by clang.
LangAS getLangASFromTargetAS(unsigned TargetAS)