clang 19.0.0git
AMDGPU.cpp
Go to the documentation of this file.
1//===- AMDGPU.cpp ---------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
13using namespace clang;
14using namespace clang::CodeGen;
15
16//===----------------------------------------------------------------------===//
17// AMDGPU ABI Implementation
18//===----------------------------------------------------------------------===//
19
20namespace {
21
22class AMDGPUABIInfo final : public DefaultABIInfo {
23private:
24 static const unsigned MaxNumRegsForArgsRet = 16;
25
26 unsigned numRegsForType(QualType Ty) const;
27
28 bool isHomogeneousAggregateBaseType(QualType Ty) const override;
30 uint64_t Members) const override;
31
32 // Coerce HIP scalar pointer arguments from generic pointers to global ones.
33 llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
34 unsigned ToAS) const {
35 // Single value types.
36 auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
37 if (PtrTy && PtrTy->getAddressSpace() == FromAS)
38 return llvm::PointerType::get(Ty->getContext(), ToAS);
39 return Ty;
40 }
41
42public:
43 explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
44 DefaultABIInfo(CGT) {}
45
47 ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
48 ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;
49
50 void computeInfo(CGFunctionInfo &FI) const override;
52 QualType Ty) const override;
53};
54
55bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
56 return true;
57}
58
59bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
60 const Type *Base, uint64_t Members) const {
61 uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
62
63 // Homogeneous Aggregates may occupy at most 16 registers.
64 return Members * NumRegs <= MaxNumRegsForArgsRet;
65}
66
67/// Estimate number of registers the type will use when passed in registers.
68unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
69 unsigned NumRegs = 0;
70
71 if (const VectorType *VT = Ty->getAs<VectorType>()) {
72 // Compute from the number of elements. The reported size is based on the
73 // in-memory size, which includes the padding 4th element for 3-vectors.
74 QualType EltTy = VT->getElementType();
75 unsigned EltSize = getContext().getTypeSize(EltTy);
76
77 // 16-bit element vectors should be passed as packed.
78 if (EltSize == 16)
79 return (VT->getNumElements() + 1) / 2;
80
81 unsigned EltNumRegs = (EltSize + 31) / 32;
82 return EltNumRegs * VT->getNumElements();
83 }
84
85 if (const RecordType *RT = Ty->getAs<RecordType>()) {
86 const RecordDecl *RD = RT->getDecl();
87 assert(!RD->hasFlexibleArrayMember());
88
89 for (const FieldDecl *Field : RD->fields()) {
90 QualType FieldTy = Field->getType();
91 NumRegs += numRegsForType(FieldTy);
92 }
93
94 return NumRegs;
95 }
96
97 return (getContext().getTypeSize(Ty) + 31) / 32;
98}
99
100void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
101 llvm::CallingConv::ID CC = FI.getCallingConvention();
102
103 if (!getCXXABI().classifyReturnType(FI))
105
106 unsigned NumRegsLeft = MaxNumRegsForArgsRet;
107 for (auto &Arg : FI.arguments()) {
108 if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
109 Arg.info = classifyKernelArgumentType(Arg.type);
110 } else {
111 Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
112 }
113 }
114}
115
116Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
117 QualType Ty) const {
118 llvm_unreachable("AMDGPU does not support varargs");
119}
120
121ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
122 if (isAggregateTypeForABI(RetTy)) {
123 // Records with non-trivial destructors/copy-constructors should not be
124 // returned by value.
125 if (!getRecordArgABI(RetTy, getCXXABI())) {
126 // Ignore empty structs/unions.
127 if (isEmptyRecord(getContext(), RetTy, true))
128 return ABIArgInfo::getIgnore();
129
130 // Lower single-element structs to just return a regular value.
131 if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
132 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
133
134 if (const RecordType *RT = RetTy->getAs<RecordType>()) {
135 const RecordDecl *RD = RT->getDecl();
136 if (RD->hasFlexibleArrayMember())
138 }
139
140 // Pack aggregates <= 4 bytes into single VGPR or pair.
141 uint64_t Size = getContext().getTypeSize(RetTy);
142 if (Size <= 16)
143 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
144
145 if (Size <= 32)
146 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
147
148 if (Size <= 64) {
149 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
150 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
151 }
152
153 if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
154 return ABIArgInfo::getDirect();
155 }
156 }
157
158 // Otherwise just do the default thing.
160}
161
162/// For kernels all parameters are really passed in a special buffer. It doesn't
163/// make sense to pass anything byval, so everything must be direct.
164ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
166
167 // TODO: Can we omit empty structs?
168
169 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
170 Ty = QualType(SeltTy, 0);
171
172 llvm::Type *OrigLTy = CGT.ConvertType(Ty);
173 llvm::Type *LTy = OrigLTy;
174 if (getContext().getLangOpts().HIP) {
175 LTy = coerceKernelArgumentType(
176 OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
177 /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
178 }
179
180 // FIXME: Should also use this for OpenCL, but it requires addressing the
181 // problem of kernels being called.
182 //
183 // FIXME: This doesn't apply the optimization of coercing pointers in structs
184 // to global address space when using byref. This would require implementing a
185 // new kind of coercion of the in-memory type when for indirect arguments.
186 if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
189 getContext().getTypeAlignInChars(Ty),
190 getContext().getTargetAddressSpace(LangAS::opencl_constant),
191 false /*Realign*/, nullptr /*Padding*/);
192 }
193
194 // If we set CanBeFlattened to true, CodeGen will expand the struct to its
195 // individual elements, which confuses the Clover OpenCL backend; therefore we
196 // have to set it to false here. Other args of getDirect() are just defaults.
197 return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
198}
199
200ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
201 unsigned &NumRegsLeft) const {
202 assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
203
205
206 if (isAggregateTypeForABI(Ty)) {
207 // Records with non-trivial destructors/copy-constructors should not be
208 // passed by value.
209 if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
210 return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
211
212 // Ignore empty structs/unions.
213 if (isEmptyRecord(getContext(), Ty, true))
214 return ABIArgInfo::getIgnore();
215
216 // Lower single-element structs to just pass a regular value. TODO: We
217 // could do reasonable-size multiple-element structs too, using getExpand(),
218 // though watch out for things like bitfields.
219 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
220 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
221
222 if (const RecordType *RT = Ty->getAs<RecordType>()) {
223 const RecordDecl *RD = RT->getDecl();
224 if (RD->hasFlexibleArrayMember())
226 }
227
228 // Pack aggregates <= 8 bytes into single VGPR or pair.
229 uint64_t Size = getContext().getTypeSize(Ty);
230 if (Size <= 64) {
231 unsigned NumRegs = (Size + 31) / 32;
232 NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
233
234 if (Size <= 16)
235 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
236
237 if (Size <= 32)
238 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
239
240 // XXX: Should this be i64 instead, and should the limit increase?
241 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
242 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
243 }
244
245 if (NumRegsLeft > 0) {
246 unsigned NumRegs = numRegsForType(Ty);
247 if (NumRegsLeft >= NumRegs) {
248 NumRegsLeft -= NumRegs;
249 return ABIArgInfo::getDirect();
250 }
251 }
252
253 // Use pass-by-reference in stead of pass-by-value for struct arguments in
254 // function ABI.
256 getContext().getTypeAlignInChars(Ty),
257 getContext().getTargetAddressSpace(LangAS::opencl_private));
258 }
259
260 // Otherwise just do the default thing.
262 if (!ArgInfo.isIndirect()) {
263 unsigned NumRegs = numRegsForType(Ty);
264 NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
265 }
266
267 return ArgInfo;
268}
269
270class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
271public:
272 AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
273 : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}
274
275 void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
276 CodeGenModule &CGM) const;
277
278 void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
279
280 void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
281 CodeGen::CodeGenModule &M) const override;
282 unsigned getOpenCLKernelCallingConv() const override;
283
284 llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
285 llvm::PointerType *T, QualType QT) const override;
286
287 LangAS getASTAllocaAddressSpace() const override {
289 getABIInfo().getDataLayout().getAllocaAddrSpace());
290 }
292 const VarDecl *D) const override;
293 llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
295 llvm::AtomicOrdering Ordering,
296 llvm::LLVMContext &Ctx) const override;
298 llvm::Function *BlockInvokeFunc,
299 llvm::Type *BlockTy) const override;
300 bool shouldEmitStaticExternCAliases() const override;
301 bool shouldEmitDWARFBitFieldSeparators() const override;
302 void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
303};
304}
305
307 llvm::GlobalValue *GV) {
308 if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
309 return false;
310
311 return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
312 (D->hasAttr<OpenCLKernelAttr>() ||
313 (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
314 (isa<VarDecl>(D) &&
315 (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
316 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
317 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
318}
319
320void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
321 const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
322 const auto *ReqdWGS =
323 M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
324 const bool IsOpenCLKernel =
325 M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
326 const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
327
328 const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
329 if (ReqdWGS || FlatWGS) {
330 M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
331 } else if (IsOpenCLKernel || IsHIPKernel) {
332 // By default, restrict the maximum size to a value specified by
333 // --gpu-max-threads-per-block=n or its default value for HIP.
334 const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
335 const unsigned DefaultMaxWorkGroupSize =
336 IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
337 : M.getLangOpts().GPUMaxThreadsPerBlock;
338 std::string AttrVal =
339 std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
340 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
341 }
342
343 if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
345
346 if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
347 unsigned NumSGPR = Attr->getNumSGPR();
348
349 if (NumSGPR != 0)
350 F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
351 }
352
353 if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
354 uint32_t NumVGPR = Attr->getNumVGPR();
355
356 if (NumVGPR != 0)
357 F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
358 }
359
360 if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
361 uint32_t X = Attr->getMaxNumWorkGroupsX()
362 ->EvaluateKnownConstInt(M.getContext())
363 .getExtValue();
364 // Y and Z dimensions default to 1 if not specified
365 uint32_t Y = Attr->getMaxNumWorkGroupsY()
366 ? Attr->getMaxNumWorkGroupsY()
367 ->EvaluateKnownConstInt(M.getContext())
368 .getExtValue()
369 : 1;
370 uint32_t Z = Attr->getMaxNumWorkGroupsZ()
371 ? Attr->getMaxNumWorkGroupsZ()
372 ->EvaluateKnownConstInt(M.getContext())
373 .getExtValue()
374 : 1;
375
376 llvm::SmallString<32> AttrVal;
377 llvm::raw_svector_ostream OS(AttrVal);
378 OS << X << ',' << Y << ',' << Z;
379
380 F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
381 }
382}
383
384/// Emits control constants used to change per-architecture behaviour in the
385/// AMDGPU ROCm device libraries.
386void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
387 CodeGen::CodeGenModule &CGM) const {
388 StringRef Name = "__oclc_ABI_version";
389 llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
390 if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
391 return;
392
394 llvm::CodeObjectVersionKind::COV_None)
395 return;
396
397 auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
398 llvm::Constant *COV = llvm::ConstantInt::get(
400
401 // It needs to be constant weak_odr without externally_initialized so that
402 // the load instuction can be eliminated by the IPSCCP.
403 auto *GV = new llvm::GlobalVariable(
404 CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
405 nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
406 CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
407 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
408 GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
409
410 // Replace any external references to this variable with the new global.
411 if (OriginalGV) {
412 OriginalGV->replaceAllUsesWith(GV);
413 GV->takeName(OriginalGV);
414 OriginalGV->eraseFromParent();
415 }
416}
417
418void AMDGPUTargetCodeGenInfo::setTargetAttributes(
419 const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
421 GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
422 GV->setDSOLocal(true);
423 }
424
425 if (GV->isDeclaration())
426 return;
427
428 llvm::Function *F = dyn_cast<llvm::Function>(GV);
429 if (!F)
430 return;
431
432 const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
433 if (FD)
434 setFunctionDeclAttributes(FD, F, M);
435
437 F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
438
439 if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
440 F->addFnAttr("amdgpu-ieee", "false");
441}
442
443unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
444 return llvm::CallingConv::AMDGPU_KERNEL;
445}
446
447// Currently LLVM assumes null pointers always have value 0,
448// which results in incorrectly transformed IR. Therefore, instead of
449// emitting null pointers in private and local address spaces, a null
450// pointer in generic address space is emitted which is casted to a
451// pointer in local or private address space.
452llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
453 const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
454 QualType QT) const {
455 if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
456 return llvm::ConstantPointerNull::get(PT);
457
458 auto &Ctx = CGM.getContext();
459 auto NPT = llvm::PointerType::get(
460 PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
461 return llvm::ConstantExpr::getAddrSpaceCast(
462 llvm::ConstantPointerNull::get(NPT), PT);
463}
464
465LangAS
466AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
467 const VarDecl *D) const {
468 assert(!CGM.getLangOpts().OpenCL &&
469 !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
470 "Address space agnostic languages only");
471 LangAS DefaultGlobalAS = getLangASFromTargetAS(
472 CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
473 if (!D)
474 return DefaultGlobalAS;
475
476 LangAS AddrSpace = D->getType().getAddressSpace();
477 if (AddrSpace != LangAS::Default)
478 return AddrSpace;
479
480 // Only promote to address space 4 if VarDecl has constant initialization.
481 if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
483 if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
484 return *ConstAS;
485 }
486 return DefaultGlobalAS;
487}
488
489llvm::SyncScope::ID
490AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
492 llvm::AtomicOrdering Ordering,
493 llvm::LLVMContext &Ctx) const {
494 std::string Name;
495 switch (Scope) {
496 case SyncScope::HIPSingleThread:
497 case SyncScope::SingleScope:
498 Name = "singlethread";
499 break;
500 case SyncScope::HIPWavefront:
501 case SyncScope::OpenCLSubGroup:
502 case SyncScope::WavefrontScope:
503 Name = "wavefront";
504 break;
505 case SyncScope::HIPWorkgroup:
506 case SyncScope::OpenCLWorkGroup:
507 case SyncScope::WorkgroupScope:
508 Name = "workgroup";
509 break;
510 case SyncScope::HIPAgent:
511 case SyncScope::OpenCLDevice:
512 case SyncScope::DeviceScope:
513 Name = "agent";
514 break;
515 case SyncScope::SystemScope:
516 case SyncScope::HIPSystem:
517 case SyncScope::OpenCLAllSVMDevices:
518 Name = "";
519 break;
520 }
521
522 if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
523 if (!Name.empty())
524 Name = Twine(Twine(Name) + Twine("-")).str();
525
526 Name = Twine(Twine(Name) + Twine("one-as")).str();
527 }
528
529 return Ctx.getOrInsertSyncScopeID(Name);
530}
531
532bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
533 return false;
534}
535
536bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
537 return true;
538}
539
540void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
541 const FunctionType *&FT) const {
542 FT = getABIInfo().getContext().adjustFunctionType(
544}
545
546/// Create an OpenCL kernel for an enqueued block.
547///
548/// The type of the first argument (the block literal) is the struct type
549/// of the block literal instead of a pointer type. The first argument
550/// (block literal) is passed directly by value to the kernel. The kernel
551/// allocates the same type of struct on stack and stores the block literal
552/// to it and passes its pointer to the block invoke function. The kernel
553/// has "enqueued-block" function attribute and kernel argument metadata.
554llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
555 CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
556 auto &Builder = CGF.Builder;
557 auto &C = CGF.getLLVMContext();
558
559 auto *InvokeFT = Invoke->getFunctionType();
567
568 ArgTys.push_back(BlockTy);
569 ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
570 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
571 ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
572 ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
573 AccessQuals.push_back(llvm::MDString::get(C, "none"));
574 ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
575 for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
576 ArgTys.push_back(InvokeFT->getParamType(I));
577 ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
578 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
579 AccessQuals.push_back(llvm::MDString::get(C, "none"));
580 ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
581 ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
582 ArgNames.push_back(
583 llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
584 }
585 std::string Name = Invoke->getName().str() + "_kernel";
586 auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
587 auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
588 &CGF.CGM.getModule());
589 F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
590
591 llvm::AttrBuilder KernelAttrs(C);
592 // FIXME: The invoke isn't applying the right attributes either
593 // FIXME: This is missing setTargetAttributes
595 KernelAttrs.addAttribute("enqueued-block");
596 F->addFnAttrs(KernelAttrs);
597
598 auto IP = CGF.Builder.saveIP();
599 auto *BB = llvm::BasicBlock::Create(C, "entry", F);
600 Builder.SetInsertPoint(BB);
601 const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
602 auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
603 BlockPtr->setAlignment(BlockAlign);
604 Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
605 auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
607 Args.push_back(Cast);
608 for (llvm::Argument &A : llvm::drop_begin(F->args()))
609 Args.push_back(&A);
610 llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
611 call->setCallingConv(Invoke->getCallingConv());
612 Builder.CreateRetVoid();
613 Builder.restoreIP(IP);
614
615 F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
616 F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
617 F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
618 F->setMetadata("kernel_arg_base_type",
619 llvm::MDNode::get(C, ArgBaseTypeNames));
620 F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
621 if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
622 F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
623
624 return F;
625}
626
628 llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
629 const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
630 int32_t *MaxThreadsVal) {
631 unsigned Min = 0;
632 unsigned Max = 0;
633 if (FlatWGS) {
634 Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
635 Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
636 }
637 if (ReqdWGS && Min == 0 && Max == 0)
638 Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
639
640 if (Min != 0) {
641 assert(Min <= Max && "Min must be less than or equal Max");
642
643 if (MinThreadsVal)
644 *MinThreadsVal = Min;
645 if (MaxThreadsVal)
646 *MaxThreadsVal = Max;
647 std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
648 if (F)
649 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
650 } else
651 assert(Max == 0 && "Max must be zero");
652}
653
655 llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
656 unsigned Min =
657 Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
658 unsigned Max =
659 Attr->getMax()
660 ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
661 : 0;
662
663 if (Min != 0) {
664 assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
665
666 std::string AttrVal = llvm::utostr(Min);
667 if (Max != 0)
668 AttrVal = AttrVal + "," + llvm::utostr(Max);
669 F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
670 } else
671 assert(Max == 0 && "Max must be zero");
672}
673
674std::unique_ptr<TargetCodeGenInfo>
676 return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
677}
static bool requiresAMDGPUProtectedVisibility(const Decl *D, llvm::GlobalValue *GV)
Definition: AMDGPU.cpp:306
#define X(type, name)
Definition: Value.h:143
Defines the clang::TargetOptions class.
uint64_t getTargetNullPointerValue(QualType QT) const
Get target-dependent integer value for null pointer which is used for constant folding.
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:757
unsigned getTargetAddressSpace(LangAS AS) const
Attr - This represents one attribute.
Definition: Attr.h:42
ABIArgInfo - Helper class to encapsulate information about how a specific C type should be passed to ...
static ABIArgInfo getIgnore()
static ABIArgInfo getDirect(llvm::Type *T=nullptr, unsigned Offset=0, llvm::Type *Padding=nullptr, bool CanBeFlattened=true, unsigned Align=0)
static ABIArgInfo getIndirectAliased(CharUnits Alignment, unsigned AddrSpace, bool Realign=false, llvm::Type *Padding=nullptr)
Pass this in memory using the IR byref attribute.
virtual bool isHomogeneousAggregateBaseType(QualType Ty) const
Definition: ABIInfo.cpp:47
virtual bool isHomogeneousAggregateSmallEnough(const Type *Base, uint64_t Members) const
Definition: ABIInfo.cpp:51
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition: Address.h:111
@ RAA_DirectInMemory
Pass it on the stack using its defined layout.
Definition: CGCXXABI.h:158
CGFunctionInfo - Class to encapsulate the information about a function definition.
unsigned getCallingConvention() const
getCallingConvention - Return the user specified calling convention, which has been translated into a...
CanQualType getReturnType() const
MutableArrayRef< ArgInfo > arguments()
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::LLVMContext & getLLVMContext()
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
void handleAMDGPUWavesPerEUAttr(llvm::Function *F, const AMDGPUWavesPerEUAttr *A)
Emit the IR encoding to attach the AMD GPU waves-per-eu attribute to F.
Definition: AMDGPU.cpp:654
const LangOptions & getLangOpts() const
const TargetInfo & getTarget() const
void handleAMDGPUFlatWorkGroupSizeAttr(llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *A, const ReqdWorkGroupSizeAttr *ReqdWGS=nullptr, int32_t *MinThreadsVal=nullptr, int32_t *MaxThreadsVal=nullptr)
Emit the IR encoding to attach the AMD GPU flat-work-group-size attribute to F.
Definition: AMDGPU.cpp:627
const llvm::DataLayout & getDataLayout() const
ASTContext & getContext() const
const CodeGenOptions & getCodeGenOpts() const
void addDefaultFunctionDefinitionAttributes(llvm::AttrBuilder &attrs)
Like the overload taking a Function &, but intended specifically for frontends that want to build on ...
Definition: CGCall.cpp:2149
This class organizes the cross-module state that is used while lowering AST types to LLVM types.
Definition: CodeGenTypes.h:54
DefaultABIInfo - The default implementation for ABI specific details.
Definition: ABIInfoImpl.h:21
ABIArgInfo classifyArgumentType(QualType RetTy) const
Definition: ABIInfoImpl.cpp:17
ABIArgInfo classifyReturnType(QualType RetTy) const
Definition: ABIInfoImpl.cpp:45
void computeInfo(CGFunctionInfo &FI) const override
Definition: ABIInfoImpl.cpp:67
Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty) const override
EmitVAArg - Emit the target dependent code to load a value of.
Definition: ABIInfoImpl.cpp:74
TargetCodeGenInfo - This class organizes various target-specific codegeneration issues,...
Definition: TargetInfo.h:46
virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const
Definition: TargetInfo.h:376
virtual llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts, SyncScope Scope, llvm::AtomicOrdering Ordering, llvm::LLVMContext &Ctx) const
Get the syncscope used in LLVM IR.
Definition: TargetInfo.cpp:154
const T & getABIInfo() const
Definition: TargetInfo.h:56
virtual unsigned getOpenCLKernelCallingConv() const
Get LLVM calling convention for OpenCL kernel.
Definition: TargetInfo.cpp:105
virtual LangAS getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const
Get target favored AST address space of a global variable for languages other than OpenCL and CUDA.
Definition: TargetInfo.cpp:124
virtual void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const
setTargetAttributes - Provides a convenient hook to handle extra target-specific attributes for the g...
Definition: TargetInfo.h:75
virtual bool shouldEmitDWARFBitFieldSeparators() const
Definition: TargetInfo.h:374
virtual llvm::Constant * getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const
Get target specific null pointer.
Definition: TargetInfo.cpp:119
virtual LangAS getASTAllocaAddressSpace() const
Get the AST address space for alloca.
Definition: TargetInfo.h:296
virtual llvm::Value * createEnqueuedBlockKernel(CodeGenFunction &CGF, llvm::Function *BlockInvokeFunc, llvm::Type *BlockTy) const
Create an OpenCL kernel for an enqueued block.
Definition: TargetInfo.cpp:177
virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const
Provides a convenient hook to handle extra target-specific globals.
Definition: TargetInfo.h:85
virtual bool shouldEmitStaticExternCAliases() const
Definition: TargetInfo.h:369
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:85
T * getAttr() const
Definition: DeclBase.h:578
bool hasAttr() const
Definition: DeclBase.h:582
Represents a member of a struct/union/class.
Definition: Decl.h:3058
Represents a function declaration or definition.
Definition: Decl.h:1971
ExtInfo withCallingConv(CallingConv cc) const
Definition: Type.h:4272
FunctionType - C99 6.7.5.3 - Function Declarators.
Definition: Type.h:4046
ExtInfo getExtInfo() const
Definition: Type.h:4375
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:449
A (possibly-)qualified type.
Definition: Type.h:738
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:7275
bool isConstantStorage(const ASTContext &Ctx, bool ExcludeCtor, bool ExcludeDtor)
Definition: Type.h:837
Represents a struct/union/class.
Definition: Decl.h:4169
bool hasFlexibleArrayMember() const
Definition: Decl.h:4202
field_range fields() const
Definition: Decl.h:4375
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
Definition: Type.h:5339
Scope - A scope is a transient data structure that is used while parsing the program.
Definition: Scope.h:41
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:307
virtual std::optional< LangAS > getConstantAddressSpace() const
Return an AST address space which can be used opportunistically for constant global memory.
Definition: TargetInfo.h:1611
bool allowAMDGPUUnsafeFPAtomics() const
Returns whether or not the AMDGPU unsafe floating point atomics are allowed.
Definition: TargetInfo.h:1027
llvm::CodeObjectVersionKind CodeObjectVersion
Code object version for AMDGPU.
Definition: TargetOptions.h:82
The base class of the type hierarchy.
Definition: Type.h:1607
const T * getAs() const
Member-template getAs&lt;specific type&gt;.
Definition: Type.h:7913
QualType getType() const
Definition: Decl.h:717
Represents a variable declaration or definition.
Definition: Decl.h:918
bool hasConstantInitialization() const
Determine whether this variable has constant initialization.
Definition: Decl.cpp:2624
Represents a GCC generic vector type.
Definition: Type.h:3759
ABIArgInfo classifyArgumentType(CodeGenModule &CGM, CanQualType type)
Classify the rules for how to pass a particular type.
CGCXXABI::RecordArgABI getRecordArgABI(const RecordType *RT, CGCXXABI &CXXABI)
bool classifyReturnType(const CGCXXABI &CXXABI, CGFunctionInfo &FI, const ABIInfo &Info)
std::unique_ptr< TargetCodeGenInfo > createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM)
Definition: AMDGPU.cpp:675
bool isAggregateTypeForABI(QualType T)
const Type * isSingleElementStruct(QualType T, ASTContext &Context)
isSingleElementStruct - Determine if a structure is a "single element struct", i.e.
QualType useFirstFieldIfTransparentUnion(QualType Ty)
Pass transparent unions as if they were the type of the first element.
bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, bool AsIfNoUniqueAddr=false)
isEmptyRecord - Return true iff a structure contains only empty fields.
bool Cast(InterpState &S, CodePtr OpPC)
Definition: Interp.h:1706
The JSON file list parser is used to communicate input to InstallAPI.
@ OpenCL
Definition: LangStandard.h:65
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
const FunctionProtoType * T
SyncScope
Defines synch scope values used internally by clang.
Definition: SyncScope.h:42
@ CC_OpenCLKernel
Definition: Specifiers.h:289
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:86
unsigned long uint64_t
Definition: Format.h:5394