10#include "TargetInfo.h"
24 static const unsigned MaxNumRegsForArgsRet = 16;
26 unsigned numRegsForType(
QualType Ty)
const;
30 uint64_t Members)
const override;
33 llvm::Type *coerceKernelArgumentType(llvm::Type *Ty,
unsigned FromAS,
34 unsigned ToAS)
const {
36 auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
37 if (PtrTy && PtrTy->getAddressSpace() == FromAS)
38 return llvm::PointerType::get(Ty->getContext(), ToAS);
49 unsigned &NumRegsLeft)
const;
56bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(
QualType Ty)
const {
60bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
61 const Type *
Base, uint64_t Members)
const {
62 uint32_t NumRegs = (getContext().getTypeSize(
Base) + 31) / 32;
65 return Members * NumRegs <= MaxNumRegsForArgsRet;
69unsigned AMDGPUABIInfo::numRegsForType(
QualType Ty)
const {
75 QualType EltTy = VT->getElementType();
76 unsigned EltSize = getContext().getTypeSize(EltTy);
80 return (VT->getNumElements() + 1) / 2;
82 unsigned EltNumRegs = (EltSize + 31) / 32;
83 return EltNumRegs * VT->getNumElements();
92 NumRegs += numRegsForType(FieldTy);
98 return (getContext().getTypeSize(Ty) + 31) / 32;
107 unsigned ArgumentIndex = 0;
110 unsigned NumRegsLeft = MaxNumRegsForArgsRet;
112 if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
113 Arg.info = classifyKernelArgumentType(Arg.type);
115 bool FixedArgument = ArgumentIndex++ < numFixedArguments;
123 const bool IsIndirect =
false;
124 const bool AllowHigherAlign =
false;
126 getContext().getTypeInfoInChars(Ty),
158 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
162 if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
181 llvm::Type *OrigLTy = CGT.ConvertType(Ty);
182 llvm::Type *LTy = OrigLTy;
183 if (getContext().getLangOpts().
HIP) {
184 LTy = coerceKernelArgumentType(
185 OrigLTy, getContext().getTargetAddressSpace(LangAS::Default),
186 getContext().getTargetAddressSpace(LangAS::cuda_device));
195 if (!getContext().getLangOpts().
OpenCL && LTy == OrigLTy &&
198 getContext().getTypeAlignInChars(Ty),
199 getContext().getTargetAddressSpace(LangAS::opencl_constant),
210 unsigned &NumRegsLeft)
const {
211 assert(NumRegsLeft <= MaxNumRegsForArgsRet &&
"register estimate underflow");
248 unsigned NumRegs = (
Size + 31) / 32;
249 NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
258 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
262 if (NumRegsLeft > 0) {
263 unsigned NumRegs = numRegsForType(Ty);
264 if (NumRegsLeft >= NumRegs) {
265 NumRegsLeft -= NumRegs;
273 getContext().getTypeAlignInChars(Ty),
274 getContext().getTargetAddressSpace(LangAS::opencl_private));
280 unsigned NumRegs = numRegsForType(Ty);
281 NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
292 void setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F,
302 llvm::PointerType *
T,
QualType QT)
const override;
306 getABIInfo().getDataLayout().getAllocaAddrSpace());
312 llvm::AtomicOrdering Ordering,
313 llvm::LLVMContext &Ctx)
const override;
315 llvm::Function *BlockInvokeFunc,
316 llvm::Type *BlockTy)
const override;
324 llvm::GlobalValue *GV) {
325 if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
328 return !
D->
hasAttr<OMPDeclareTargetDeclAttr>() &&
330 (isa<FunctionDecl>(
D) &&
D->
hasAttr<CUDAGlobalAttr>()) ||
333 cast<VarDecl>(
D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
334 cast<VarDecl>(
D)->getType()->isCUDADeviceBuiltinTextureType())));
337void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
339 const auto *ReqdWGS =
341 const bool IsOpenCLKernel =
345 const auto *FlatWGS = FD->
getAttr<AMDGPUFlatWorkGroupSizeAttr>();
346 if (ReqdWGS || FlatWGS) {
348 }
else if (IsOpenCLKernel || IsHIPKernel) {
351 const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
352 const unsigned DefaultMaxWorkGroupSize =
353 IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
355 std::string AttrVal =
356 std::string(
"1,") + llvm::utostr(DefaultMaxWorkGroupSize);
357 F->addFnAttr(
"amdgpu-flat-work-group-size", AttrVal);
360 if (
const auto *
Attr = FD->
getAttr<AMDGPUWavesPerEUAttr>())
363 if (
const auto *
Attr = FD->
getAttr<AMDGPUNumSGPRAttr>()) {
364 unsigned NumSGPR =
Attr->getNumSGPR();
367 F->addFnAttr(
"amdgpu-num-sgpr", llvm::utostr(NumSGPR));
370 if (
const auto *
Attr = FD->
getAttr<AMDGPUNumVGPRAttr>()) {
371 uint32_t NumVGPR =
Attr->getNumVGPR();
374 F->addFnAttr(
"amdgpu-num-vgpr", llvm::utostr(NumVGPR));
377 if (
const auto *
Attr = FD->
getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
378 uint32_t
X =
Attr->getMaxNumWorkGroupsX()
382 uint32_t Y =
Attr->getMaxNumWorkGroupsY()
383 ?
Attr->getMaxNumWorkGroupsY()
387 uint32_t Z =
Attr->getMaxNumWorkGroupsZ()
388 ?
Attr->getMaxNumWorkGroupsZ()
394 llvm::raw_svector_ostream OS(AttrVal);
395 OS <<
X <<
',' << Y <<
',' << Z;
397 F->addFnAttr(
"amdgpu-max-num-workgroups", AttrVal.str());
403void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
405 StringRef Name =
"__oclc_ABI_version";
406 llvm::GlobalVariable *OriginalGV = CGM.
getModule().getNamedGlobal(Name);
407 if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
411 llvm::CodeObjectVersionKind::COV_None)
414 auto *
Type = llvm::IntegerType::getIntNTy(CGM.
getModule().getContext(), 32);
415 llvm::Constant *COV = llvm::ConstantInt::get(
420 auto *GV =
new llvm::GlobalVariable(
421 CGM.
getModule(),
Type,
true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
422 nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
424 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
425 GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
429 OriginalGV->replaceAllUsesWith(GV);
430 GV->takeName(OriginalGV);
431 OriginalGV->eraseFromParent();
435void AMDGPUTargetCodeGenInfo::setTargetAttributes(
438 GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
439 GV->setDSOLocal(
true);
442 if (GV->isDeclaration())
445 llvm::Function *F = dyn_cast<llvm::Function>(GV);
451 setFunctionDeclAttributes(FD, F, M);
454 F->addFnAttr(
"amdgpu-unsafe-fp-atomics",
"true");
456 if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
457 F->addFnAttr(
"amdgpu-ieee",
"false");
460unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv()
const {
461 return llvm::CallingConv::AMDGPU_KERNEL;
469llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
473 return llvm::ConstantPointerNull::get(PT);
476 auto NPT = llvm::PointerType::get(
477 PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
478 return llvm::ConstantExpr::getAddrSpaceCast(
479 llvm::ConstantPointerNull::get(NPT), PT);
483AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(
CodeGenModule &CGM,
487 "Address space agnostic languages only");
491 return DefaultGlobalAS;
493 LangAS AddrSpace =
D->getType().getAddressSpace();
494 if (AddrSpace != LangAS::Default)
498 if (
D->getType().isConstantStorage(CGM.
getContext(),
false,
false) &&
499 D->hasConstantInitialization()) {
503 return DefaultGlobalAS;
507AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(
const LangOptions &LangOpts,
509 llvm::AtomicOrdering Ordering,
510 llvm::LLVMContext &Ctx)
const {
513 case SyncScope::HIPSingleThread:
514 case SyncScope::SingleScope:
515 Name =
"singlethread";
517 case SyncScope::HIPWavefront:
518 case SyncScope::OpenCLSubGroup:
519 case SyncScope::WavefrontScope:
522 case SyncScope::HIPWorkgroup:
523 case SyncScope::OpenCLWorkGroup:
524 case SyncScope::WorkgroupScope:
527 case SyncScope::HIPAgent:
528 case SyncScope::OpenCLDevice:
529 case SyncScope::DeviceScope:
532 case SyncScope::SystemScope:
533 case SyncScope::HIPSystem:
534 case SyncScope::OpenCLAllSVMDevices:
539 if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
541 Name = Twine(Twine(Name) + Twine(
"-")).str();
543 Name = Twine(Twine(Name) + Twine(
"one-as")).str();
546 return Ctx.getOrInsertSyncScopeID(Name);
549bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases()
const {
553bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators()
const {
557void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
559 FT = getABIInfo().getContext().adjustFunctionType(
571llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
572 CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy)
const {
576 auto *InvokeFT = Invoke->getFunctionType();
585 ArgTys.push_back(BlockTy);
586 ArgTypeNames.push_back(llvm::MDString::get(
C,
"__block_literal"));
587 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
588 ArgBaseTypeNames.push_back(llvm::MDString::get(
C,
"__block_literal"));
589 ArgTypeQuals.push_back(llvm::MDString::get(
C,
""));
590 AccessQuals.push_back(llvm::MDString::get(
C,
"none"));
591 ArgNames.push_back(llvm::MDString::get(
C,
"block_literal"));
592 for (
unsigned I = 1,
E = InvokeFT->getNumParams(); I <
E; ++I) {
593 ArgTys.push_back(InvokeFT->getParamType(I));
594 ArgTypeNames.push_back(llvm::MDString::get(
C,
"void*"));
595 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
596 AccessQuals.push_back(llvm::MDString::get(
C,
"none"));
597 ArgBaseTypeNames.push_back(llvm::MDString::get(
C,
"void*"));
598 ArgTypeQuals.push_back(llvm::MDString::get(
C,
""));
600 llvm::MDString::get(
C, (Twine(
"local_arg") + Twine(I)).str()));
602 std::string Name = Invoke->getName().str() +
"_kernel";
603 auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(
C), ArgTys,
false);
604 auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
606 F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
608 llvm::AttrBuilder KernelAttrs(
C);
612 KernelAttrs.addAttribute(
"enqueued-block");
613 F->addFnAttrs(KernelAttrs);
615 auto IP = CGF.
Builder.saveIP();
616 auto *BB = llvm::BasicBlock::Create(
C,
"entry", F);
617 Builder.SetInsertPoint(BB);
619 auto *BlockPtr = Builder.CreateAlloca(BlockTy,
nullptr);
620 BlockPtr->setAlignment(BlockAlign);
621 Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
622 auto *
Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
624 Args.push_back(Cast);
625 for (llvm::Argument &A : llvm::drop_begin(F->args()))
627 llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
628 call->setCallingConv(Invoke->getCallingConv());
629 Builder.CreateRetVoid();
630 Builder.restoreIP(IP);
632 F->setMetadata(
"kernel_arg_addr_space", llvm::MDNode::get(
C, AddressQuals));
633 F->setMetadata(
"kernel_arg_access_qual", llvm::MDNode::get(
C, AccessQuals));
634 F->setMetadata(
"kernel_arg_type", llvm::MDNode::get(
C, ArgTypeNames));
635 F->setMetadata(
"kernel_arg_base_type",
636 llvm::MDNode::get(
C, ArgBaseTypeNames));
637 F->setMetadata(
"kernel_arg_type_qual", llvm::MDNode::get(
C, ArgTypeQuals));
639 F->setMetadata(
"kernel_arg_name", llvm::MDNode::get(
C, ArgNames));
645 llvm::Function *F,
const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
646 const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
647 int32_t *MaxThreadsVal) {
651 Min = FlatWGS->getMin()->EvaluateKnownConstInt(
getContext()).getExtValue();
652 Max = FlatWGS->getMax()->EvaluateKnownConstInt(
getContext()).getExtValue();
654 if (ReqdWGS &&
Min == 0 &&
Max == 0)
655 Min =
Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
658 assert(
Min <=
Max &&
"Min must be less than or equal Max");
661 *MinThreadsVal =
Min;
663 *MaxThreadsVal =
Max;
664 std::string AttrVal = llvm::utostr(
Min) +
"," + llvm::utostr(
Max);
666 F->addFnAttr(
"amdgpu-flat-work-group-size", AttrVal);
668 assert(
Max == 0 &&
"Max must be zero");
672 llvm::Function *F,
const AMDGPUWavesPerEUAttr *
Attr) {
674 Attr->getMin()->EvaluateKnownConstInt(
getContext()).getExtValue();
677 ?
Attr->getMax()->EvaluateKnownConstInt(
getContext()).getExtValue()
681 assert((
Max == 0 ||
Min <=
Max) &&
"Min must be less than or equal Max");
683 std::string AttrVal = llvm::utostr(
Min);
685 AttrVal = AttrVal +
"," + llvm::utostr(
Max);
686 F->addFnAttr(
"amdgpu-waves-per-eu", AttrVal);
688 assert(
Max == 0 &&
"Max must be zero");
691std::unique_ptr<TargetCodeGenInfo>
693 return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.
getTypes());
static bool requiresAMDGPUProtectedVisibility(const Decl *D, llvm::GlobalValue *GV)
Defines the clang::TargetOptions class.
uint64_t getTargetNullPointerValue(QualType QT) const
Get target-dependent integer value for null pointer which is used for constant folding.
const TargetInfo & getTargetInfo() const
unsigned getTargetAddressSpace(LangAS AS) const
Attr - This represents one attribute.
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
ABIArgInfo - Helper class to encapsulate information about how a specific C type should be passed to ...
static ABIArgInfo getIgnore()
static ABIArgInfo getDirect(llvm::Type *T=nullptr, unsigned Offset=0, llvm::Type *Padding=nullptr, bool CanBeFlattened=true, unsigned Align=0)
static ABIArgInfo getIndirectAliased(CharUnits Alignment, unsigned AddrSpace, bool Realign=false, llvm::Type *Padding=nullptr)
Pass this in memory using the IR byref attribute.
virtual bool isHomogeneousAggregateBaseType(QualType Ty) const
virtual bool isHomogeneousAggregateSmallEnough(const Type *Base, uint64_t Members) const
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
@ RAA_DirectInMemory
Pass it on the stack using its defined layout.
CGFunctionInfo - Class to encapsulate the information about a function definition.
ABIArgInfo & getReturnInfo()
unsigned getCallingConvention() const
getCallingConvention - Return the user specified calling convention, which has been translated into a...
CanQualType getReturnType() const
MutableArrayRef< ArgInfo > arguments()
unsigned getNumRequiredArgs() const
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::LLVMContext & getLLVMContext()
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
void handleAMDGPUWavesPerEUAttr(llvm::Function *F, const AMDGPUWavesPerEUAttr *A)
Emit the IR encoding to attach the AMD GPU waves-per-eu attribute to F.
const LangOptions & getLangOpts() const
CodeGenTypes & getTypes()
const TargetInfo & getTarget() const
void handleAMDGPUFlatWorkGroupSizeAttr(llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *A, const ReqdWorkGroupSizeAttr *ReqdWGS=nullptr, int32_t *MinThreadsVal=nullptr, int32_t *MaxThreadsVal=nullptr)
Emit the IR encoding to attach the AMD GPU flat-work-group-size attribute to F.
const llvm::DataLayout & getDataLayout() const
ASTContext & getContext() const
const CodeGenOptions & getCodeGenOpts() const
void addDefaultFunctionDefinitionAttributes(llvm::AttrBuilder &attrs)
Like the overload taking a Function &, but intended specifically for frontends that want to build on ...
This class organizes the cross-module state that is used while lowering AST types to LLVM types.
DefaultABIInfo - The default implementation for ABI specific details.
ABIArgInfo classifyArgumentType(QualType RetTy) const
RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, AggValueSlot Slot) const override
EmitVAArg - Emit the target dependent code to load a value of type Ty from the va_list pointed to by VAListAddr.
ABIArgInfo classifyReturnType(QualType RetTy) const
void computeInfo(CGFunctionInfo &FI) const override
RValue - This trivial value class is used to represent the result of an expression that is evaluated.
TargetCodeGenInfo - This class organizes various target-specific codegeneration issues,...
virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const
virtual llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts, SyncScope Scope, llvm::AtomicOrdering Ordering, llvm::LLVMContext &Ctx) const
Get the syncscope used in LLVM IR.
const T & getABIInfo() const
virtual unsigned getOpenCLKernelCallingConv() const
Get LLVM calling convention for OpenCL kernel.
virtual LangAS getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const
Get target favored AST address space of a global variable for languages other than OpenCL and CUDA.
virtual void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const
setTargetAttributes - Provides a convenient hook to handle extra target-specific attributes for the g...
virtual bool shouldEmitDWARFBitFieldSeparators() const
virtual llvm::Constant * getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const
Get target specific null pointer.
virtual LangAS getASTAllocaAddressSpace() const
Get the AST address space for alloca.
virtual llvm::Value * createEnqueuedBlockKernel(CodeGenFunction &CGF, llvm::Function *BlockInvokeFunc, llvm::Type *BlockTy) const
Create an OpenCL kernel for an enqueued block.
virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const
Provides a convenient hook to handle extra target-specific globals.
virtual bool shouldEmitStaticExternCAliases() const
Decl - This represents one declaration (or definition), e.g.
Represents a member of a struct/union/class.
Represents a function declaration or definition.
ExtInfo withCallingConv(CallingConv cc) const
FunctionType - C99 6.7.5.3 - Function Declarators.
ExtInfo getExtInfo() const
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
A (possibly-)qualified type.
Represents a struct/union/class.
bool hasFlexibleArrayMember() const
field_range fields() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
Scope - A scope is a transient data structure that is used while parsing the program.
TargetOptions & getTargetOpts() const
Retrieve the target options.
virtual std::optional< LangAS > getConstantAddressSpace() const
Return an AST address space which can be used opportunistically for constant global memory.
bool allowAMDGPUUnsafeFPAtomics() const
Returns whether or not the AMDGPU unsafe floating point atomics are allowed.
llvm::CodeObjectVersionKind CodeObjectVersion
Code object version for AMDGPU.
The base class of the type hierarchy.
const T * getAs() const
Member-template getAs<specific type>.
Represents a variable declaration or definition.
Represents a GCC generic vector type.
ABIArgInfo classifyArgumentType(CodeGenModule &CGM, CanQualType type)
Classify the rules for how to pass a particular type.
CGCXXABI::RecordArgABI getRecordArgABI(const RecordType *RT, CGCXXABI &CXXABI)
bool classifyReturnType(const CGCXXABI &CXXABI, CGFunctionInfo &FI, const ABIInfo &Info)
std::unique_ptr< TargetCodeGenInfo > createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM)
RValue emitVoidPtrVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType ValueTy, bool IsIndirect, TypeInfoChars ValueInfo, CharUnits SlotSizeAndAlign, bool AllowHigherAlign, AggValueSlot Slot, bool ForceRightAdjust=false)
Emit va_arg for a platform using the common void* representation, where arguments are simply emitted ...
bool isAggregateTypeForABI(QualType T)
const Type * isSingleElementStruct(QualType T, ASTContext &Context)
isSingleElementStruct - Determine if a structure is a "single element struct", i.e. it has exactly one non-empty field or exactly one field which is itself a single element struct.
QualType useFirstFieldIfTransparentUnion(QualType Ty)
Pass transparent unions as if they were the type of the first element.
bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, bool AsIfNoUniqueAddr=false)
isEmptyRecord - Return true iff a structure contains only empty fields.
bool Cast(InterpState &S, CodePtr OpPC)
The JSON file list parser is used to communicate input to InstallAPI.
LangAS
Defines the address space values used by the address space qualifier of QualType.
const FunctionProtoType * T
SyncScope
Defines synch scope values used internally by clang.
LangAS getLangASFromTargetAS(unsigned TargetAS)