//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/AST/DeclCXX.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  uint64_t numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;

  llvm::FixedVectorType *
  getOptimalVectorMemoryType(llvm::FixedVectorType *T,
                             const LangOptions &Opt) const override {
    // We have legal instructions for 96-bit so 3x32 can be supported.
    // FIXME: This check should be a subtarget feature as technically SI doesn't
    // support it.
    if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
      return T;
    return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
  }
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}
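
// For instance, with a float base type (one 32-bit register per element) a
// homogeneous aggregate may have up to 16 members; with a double base type
// (two registers per element) the limit drops to 8.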

/// Check if all fields in an aggregate type contain only sub-32-bit integer
/// types. Such aggregates should be packed into i32 registers rather than
/// passed as individual elements. Aggregates containing floats or full-sized
/// integer types (i32, i64) should preserve their original types.
static bool containsOnlyPackableIntegerTypes(const RecordDecl *RD,
                                             const ASTContext &Context) {
  for (const FieldDecl *Field : RD->fields()) {
    QualType FieldTy = Field->getType();

    // Bitfields are always integer types, so they are always packable. A
    // bitfield like "unsigned a : 4" should be packable even though 'unsigned'
    // is 32 bits. Similarly, larger bitfields that fill wider integers (like
    // i64) should also be packed.
    if (Field->isBitField()) {
      continue;
    }

    // Recursively check nested structs.
    if (const RecordDecl *NestedRD = FieldTy->getAsRecordDecl()) {
      if (!containsOnlyPackableIntegerTypes(NestedRD, Context))
        return false;
      continue;
    }

    // Arrays - check the element type.
    if (const ConstantArrayType *AT = Context.getAsConstantArrayType(FieldTy)) {
      QualType EltTy = AT->getElementType();
      if (const RecordDecl *NestedRD = EltTy->getAsRecordDecl()) {
        if (!containsOnlyPackableIntegerTypes(NestedRD, Context))
          return false;
        continue;
      }
      // For non-struct array elements, check if they're packable integers.
      if (!EltTy->isIntegerType())
        return false;
      uint64_t EltSize = Context.getTypeSize(EltTy);
      if (EltSize >= 32)
        return false;
      continue;
    }

    // Floating point types should not be packed into integers.
    if (FieldTy->isFloatingType())
      return false;

    // Only integer types that are smaller than 32 bits should be packed.
    if (!FieldTy->isIntegerType())
      return false;

    uint64_t FieldSize = Context.getTypeSize(FieldTy);
    if (FieldSize >= 32)
      return false;
  }
  return true;
}
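
// For illustration, under the rule above
//   struct S1 { char a; short b; };            // all fields < 32 bits: packable
//   struct S2 { unsigned a : 4; int b : 28; }; // bitfields only: packable
// are packable, while
//   struct S3 { int a; char b; };              // contains a full i32: not packable
//   struct S4 { float x; };                    // contains a float: not packable
// keep their original field types.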

/// Estimate number of registers the type will use when passed in registers.
uint64_t AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  uint64_t NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    uint64_t EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    uint64_t EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const auto *RD = Ty->getAsRecordDecl()) {
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
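
// A sketch of the counting rule above: a <4 x half> vector packs two elements
// per register and costs 2 registers; <4 x float> costs 4; <2 x double> costs
// 4; and struct { int a; float b; } costs 1 + 1 = 2.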

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}
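
// Note on the va_arg lowering above: every variadic slot is 4-byte aligned and
// arguments are never passed indirectly, so e.g. va_arg(ap, double) reads its
// 8 bytes from a 4-byte-aligned slot rather than a naturally aligned one.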

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const auto *RD = RetTy->getAsRecordDecl();
          RD && RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyReturnType(RetTy);

      // Pack aggregates <= 8 bytes into a single VGPR or pair, but only if
      // they contain sub-32-bit integer types. Aggregates with floats or
      // full-sized integers should preserve their original types.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 64) {
        const RecordDecl *RD = RetTy->getAsRecordDecl();
        bool ShouldPackToInt =
            RD && containsOnlyPackableIntegerTypes(RD, getContext());

        if (ShouldPackToInt) {
          if (Size <= 16)
            return ABIArgInfo::getDirect(
                llvm::Type::getInt16Ty(getVMContext()));

          if (Size <= 32)
            return ABIArgInfo::getDirect(
                llvm::Type::getInt32Ty(getVMContext()));

          llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
          return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
        }
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
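
// A rough sketch of the packing above: returning struct { char a; char b; }
// (16 bits of sub-32-bit integers) lowers to a direct i16, struct { short a;
// short b; } to an i32, and struct { short a[3]; } to [2 x i32]; returning
// struct { float x; float y; } stays direct with its original type.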

/// For kernels all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing
  // a new kind of coercion of the in-memory type for indirect arguments.
  if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
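
// Two illustrative consequences of the classification above: a HIP kernel
// taking an "int *" (a generic, address-space-0 pointer) has the argument
// coerced to a global "ptr addrspace(1)", while an aggregate such as
// struct S { int v[16]; } is passed byref in the constant address space
// (addrspace(4)) rather than byval.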

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
                                     RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const auto *RD = Ty->getAsRecordDecl();
        RD && RD->hasFlexibleArrayMember())
      return DefaultABIInfo::classifyArgumentType(Ty);

    // Pack aggregates <= 8 bytes into a single VGPR or pair, but only if they
    // contain sub-32-bit integer types. Aggregates with floats or full-sized
    // integers (i32, i64) should preserve their original types.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      const RecordDecl *RD = Ty->getAsRecordDecl();
      bool ShouldPackToInt =
          RD && containsOnlyPackableIntegerTypes(RD, getContext());

      if (ShouldPackToInt) {
        unsigned NumRegs = (Size + 31) / 32;
        NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

        if (Size <= 16)
          return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

        if (Size <= 32)
          return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }
    }

    if (NumRegsLeft > 0) {
      uint64_t NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    uint64_t NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, uint64_t{NumRegsLeft});
  }

  return ArgInfo;
}
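
// A sketch of the register budget above: with MaxNumRegsForArgsRet = 16,
// aggregates passed to a non-kernel function are passed directly while
// NumRegsLeft still covers them; once an aggregate no longer fits in the
// remaining budget, it is passed byref in the private address space
// (addrspace(5)) instead.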

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  bool supportsLibCall() const override { return false; }
  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getDeviceKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }

  LangAS getSRetAddrSpace(const CXXRecordDecl *RD) const override;

  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  StringRef getLLVMSyncScopeStr(const LangOptions &LangOpts, SyncScope Scope,
                                llvm::AtomicOrdering Ordering) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<DeviceKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<DeviceKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified.
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }

  if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
    auto GetExprVal = [&](const auto &E) {
      return E ? E->EvaluateKnownConstInt(M.getContext()).getExtValue() : 1;
    };
    unsigned X = GetExprVal(Attr->getX());
    unsigned Y = GetExprVal(Attr->getY());
    unsigned Z = GetExprVal(Attr->getZ());
    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;
    F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
  }

  // OpenCL doesn't support the cluster feature.
  const TargetInfo &TTI = M.getContext().getTargetInfo();
  if ((IsOpenCLKernel &&
       TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) ||
      FD->hasAttr<CUDANoClusterAttr>())
    F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
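
// For example, a HIP __global__ kernel with no explicit launch-bounds
// attributes ends up with "amdgpu-flat-work-group-size"="1,1024" (1024 being
// the default --gpu-max-threads-per-block value), while an OpenCL kernel gets
// "1,256"; __attribute__((amdgpu_max_num_work_groups(16))) adds
// "amdgpu-max-num-workgroups"="16,1,1".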

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);
  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
  if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling)
    F->addFnAttr("amdgpu-expand-waitcnt-profiling");
}

unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is cast to a
// pointer in local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}
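
// For instance, a null pointer in the private address space is emitted as
//   addrspacecast (ptr null to ptr addrspace(5))
// since the target null value there is not the all-zero bit pattern.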

LangAS
AMDGPUTargetCodeGenInfo::getSRetAddrSpace(const CXXRecordDecl *RD) const {
  // Types with no viable copy/move must be constructed in-place; use the
  // default AS so the sret pointer matches the "this" convention.
  if (RD && !RD->canPassInRegisters())
    return LangAS::Default;
  return getASTAllocaAddressSpace();
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

StringRef AMDGPUTargetCodeGenInfo::getLLVMSyncScopeStr(
    const LangOptions &LangOpts, SyncScope Scope,
    llvm::AtomicOrdering Ordering) const {

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
  bool IsOneAs = (Scope >= SyncScope::OpenCLWorkGroup &&
                  Scope <= SyncScope::OpenCLSubGroup &&
                  Ordering != llvm::AtomicOrdering::SequentiallyConsistent);

  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    return IsOneAs ? "singlethread-one-as" : "singlethread";
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    return IsOneAs ? "wavefront-one-as" : "wavefront";
  case SyncScope::HIPCluster:
  case SyncScope::ClusterScope:
    assert(!IsOneAs && "OpenCL does not have cluster scope");
    return "cluster";
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    return IsOneAs ? "workgroup-one-as" : "workgroup";
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    return IsOneAs ? "agent-one-as" : "agent";
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    return IsOneAs ? "one-as" : "";
  }
  llvm_unreachable("Unknown SyncScope enum");
}
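
// E.g. an OpenCL atomic_fetch_add_explicit(..., memory_order_relaxed,
// memory_scope_work_group) lowers to an atomicrmw with
// syncscope("workgroup-one-as"), while the same operation with
// memory_order_seq_cst uses syncscope("workgroup").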

void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);

  // OpenCL and old style HIP atomics consider atomics targeting thread private
  // memory to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
  }

  if (!RMW)
    return;

  AtomicOptions AO = CGF.CGM.getAtomicOpts();
  llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
  if (!AO.getOption(clang::AtomicOptionKind::FineGrainedMemory))
    RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
  if (!AO.getOption(clang::AtomicOptionKind::RemoteMemory))
    RMW->setMetadata("amdgpu.no.remote.memory", Empty);
  if (AO.getOption(clang::AtomicOptionKind::IgnoreDenormalMode) &&
      RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
      RMW->getType()->isFloatTy())
    RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
}
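
// With atomic options that do not assume fine-grained or remote memory (the
// usual HIP defaults), a floating-point atomicrmw fadd is tagged roughly as:
//   %old = atomicrmw fadd ptr %p, float %v monotonic,
//          !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
// which lets the backend select a hardware atomic instead of a CAS loop.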

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_DeviceKernel));
}

/// Return IR struct type for rtinfo struct in rocm-device-libs used for device
/// enqueue.
///
/// ptr addrspace(1) kernel_object, i32 private_segment_size,
/// i32 group_segment_size
static llvm::StructType *
getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
                           llvm::Type *KernelDescriptorPtrTy) {
  llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
  return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
                                  "block.runtime.handle.t");
}
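
// The handle created above corresponds to this IR type:
//   %block.runtime.handle.t = type { ptr addrspace(1), i32, i32 }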

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }

  llvm::Module &Mod = CGF.CGM.getModule();
  const llvm::DataLayout &DL = Mod.getDataLayout();

  llvm::Twine Name = Invoke->getName() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);

  // The kernel itself can be internal; the runtime does not directly access
  // the kernel address (only the kernel descriptor).
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &Mod);
  F->setCallingConv(getDeviceKernelCallingConv());

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
      C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
  llvm::Constant *RuntimeHandleInitializer =
      llvm::ConstantAggregateZero::get(HandleTy);

  llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";

  // The runtime needs access to the runtime handle as an external symbol. The
  // runtime handle will need to be made external later, in
  // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
  // inside the runtime handle, and is not directly referenced.

  // TODO: We would initialize the first field by declaring F->getName() + ".kd"
  // to reference the kernel descriptor. The runtime wouldn't need to bother
  // setting it. We would need to have a final symbol name though.
  // TODO: Can we directly use an external symbol with getGlobalIdentifier?
  auto *RuntimeHandle = new llvm::GlobalVariable(
      Mod, HandleTy,
      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
      /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace(),
      /*isExternallyInitialized=*/true);

  llvm::MDNode *HandleAsMD =
      llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
  F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);

  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");

  CGF.CGM.addUsedGlobal(F);
  CGF.CGM.addUsedGlobal(RuntimeHandle);
  return RuntimeHandle;
}
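
// Sketch of the emitted structure: for a block invoke function named, say,
// @foo_block_invoke, this produces an internal AMDGPU_KERNEL
// @foo_block_invoke_kernel taking the block literal by value, plus an
// externally initialized global @foo_block_invoke_kernel.runtime.handle of
// type %block.runtime.handle.t in section ".amdgpu.kernel.runtime.handle";
// the returned value is that handle, not the kernel itself.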

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  auto Eval = [&](Expr *E) {
    return E->EvaluateKnownConstInt(getContext()).getExtValue();
  };
  if (FlatWGS) {
    Min = Eval(FlatWGS->getMin());
    Max = Eval(FlatWGS->getMax());
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
                Eval(ReqdWGS->getZDim());

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
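
// E.g. __attribute__((amdgpu_flat_work_group_size(32, 64))) yields
// "amdgpu-flat-work-group-size"="32,64", and an OpenCL kernel with
// reqd_work_group_size(8, 8, 2) yields "128,128" (8 * 8 * 2 threads).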

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
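
// E.g. __attribute__((amdgpu_waves_per_eu(2, 4))) yields
// "amdgpu-waves-per-eu"="2,4"; with only a minimum, amdgpu_waves_per_eu(2)
// yields "amdgpu-waves-per-eu"="2".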

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}