clang 22.0.0git
AMDGPU.cpp
Go to the documentation of this file.
1//===- AMDGPU.cpp ---------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ABIInfoImpl.h"
10#include "TargetInfo.h"
11#include "llvm/ADT/StringExtras.h"
12#include "llvm/Support/AMDGPUAddrSpace.h"
13
14using namespace clang;
15using namespace clang::CodeGen;
16
17//===----------------------------------------------------------------------===//
18// AMDGPU ABI Implementation
19//===----------------------------------------------------------------------===//
20
21namespace {
22
23class AMDGPUABIInfo final : public DefaultABIInfo {
24private:
25 static const unsigned MaxNumRegsForArgsRet = 16;
26
27 unsigned numRegsForType(QualType Ty) const;
28
29 bool isHomogeneousAggregateBaseType(QualType Ty) const override;
30 bool isHomogeneousAggregateSmallEnough(const Type *Base,
31 uint64_t Members) const override;
32
33 // Coerce HIP scalar pointer arguments from generic pointers to global ones.
34 llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
35 unsigned ToAS) const {
36 // Single value types.
37 auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
38 if (PtrTy && PtrTy->getAddressSpace() == FromAS)
39 return llvm::PointerType::get(Ty->getContext(), ToAS);
40 return Ty;
41 }
42
43public:
44 explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
45 DefaultABIInfo(CGT) {}
46
47 ABIArgInfo classifyReturnType(QualType RetTy) const;
48 ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
49 ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
50 unsigned &NumRegsLeft) const;
51
52 void computeInfo(CGFunctionInfo &FI) const override;
53 RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
54 AggValueSlot Slot) const override;
55
56 llvm::FixedVectorType *
57 getOptimalVectorMemoryType(llvm::FixedVectorType *T,
58 const LangOptions &Opt) const override {
59 // We have legal instructions for 96-bit so 3x32 can be supported.
60 // FIXME: This check should be a subtarget feature as technically SI doesn't
61 // support it.
62 if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
63 return T;
64 return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
65 }
66};
67
68bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
69 return true;
70}
71
72bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
73 const Type *Base, uint64_t Members) const {
74 uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
75
76 // Homogeneous Aggregates may occupy at most 16 registers.
77 return Members * NumRegs <= MaxNumRegsForArgsRet;
78}
79
80/// Estimate number of registers the type will use when passed in registers.
81unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
82 unsigned NumRegs = 0;
83
84 if (const VectorType *VT = Ty->getAs<VectorType>()) {
85 // Compute from the number of elements. The reported size is based on the
86 // in-memory size, which includes the padding 4th element for 3-vectors.
87 QualType EltTy = VT->getElementType();
88 unsigned EltSize = getContext().getTypeSize(EltTy);
89
90 // 16-bit element vectors should be passed as packed.
91 if (EltSize == 16)
92 return (VT->getNumElements() + 1) / 2;
93
94 unsigned EltNumRegs = (EltSize + 31) / 32;
95 return EltNumRegs * VT->getNumElements();
96 }
97
98 if (const auto *RD = Ty->getAsRecordDecl()) {
99 assert(!RD->hasFlexibleArrayMember());
100
101 for (const FieldDecl *Field : RD->fields()) {
102 QualType FieldTy = Field->getType();
103 NumRegs += numRegsForType(FieldTy);
104 }
105
106 return NumRegs;
107 }
108
109 return (getContext().getTypeSize(Ty) + 31) / 32;
110}
111
112void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
113 llvm::CallingConv::ID CC = FI.getCallingConvention();
114
115 if (!getCXXABI().classifyReturnType(FI))
117
118 unsigned ArgumentIndex = 0;
119 const unsigned numFixedArguments = FI.getNumRequiredArgs();
120
121 unsigned NumRegsLeft = MaxNumRegsForArgsRet;
122 for (auto &Arg : FI.arguments()) {
123 if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
124 Arg.info = classifyKernelArgumentType(Arg.type);
125 } else {
126 bool FixedArgument = ArgumentIndex++ < numFixedArguments;
127 Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
128 }
129 }
130}
131
132RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
133 QualType Ty, AggValueSlot Slot) const {
134 const bool IsIndirect = false;
135 const bool AllowHigherAlign = false;
136 return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
137 getContext().getTypeInfoInChars(Ty),
138 CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
139}
140
141ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
142 if (isAggregateTypeForABI(RetTy)) {
143 // Records with non-trivial destructors/copy-constructors should not be
144 // returned by value.
145 if (!getRecordArgABI(RetTy, getCXXABI())) {
146 // Ignore empty structs/unions.
147 if (isEmptyRecord(getContext(), RetTy, true))
148 return ABIArgInfo::getIgnore();
149
150 // Lower single-element structs to just return a regular value.
151 if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
152 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
153
154 if (const auto *RD = RetTy->getAsRecordDecl();
155 RD && RD->hasFlexibleArrayMember())
157
158 // Pack aggregates <= 4 bytes into single VGPR or pair.
159 uint64_t Size = getContext().getTypeSize(RetTy);
160 if (Size <= 16)
161 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
162
163 if (Size <= 32)
164 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
165
166 if (Size <= 64) {
167 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
168 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
169 }
170
171 if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
172 return ABIArgInfo::getDirect();
173 }
174 }
175
176 // Otherwise just do the default thing.
178}
179
180/// For kernels all parameters are really passed in a special buffer. It doesn't
181/// make sense to pass anything byval, so everything must be direct.
182ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
184
185 // TODO: Can we omit empty structs?
186
187 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
188 Ty = QualType(SeltTy, 0);
189
190 llvm::Type *OrigLTy = CGT.ConvertType(Ty);
191 llvm::Type *LTy = OrigLTy;
192 if (getContext().getLangOpts().HIP) {
193 LTy = coerceKernelArgumentType(
194 OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
195 /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
196 }
197
198 // FIXME: This doesn't apply the optimization of coercing pointers in structs
199 // to global address space when using byref. This would require implementing a
200 // new kind of coercion of the in-memory type when for indirect arguments.
201 if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
203 getContext().getTypeAlignInChars(Ty),
204 getContext().getTargetAddressSpace(LangAS::opencl_constant),
205 false /*Realign*/, nullptr /*Padding*/);
206 }
207
208 // If we set CanBeFlattened to true, CodeGen will expand the struct to its
209 // individual elements, which confuses the Clover OpenCL backend; therefore we
210 // have to set it to false here. Other args of getDirect() are just defaults.
211 return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
212}
213
214ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
215 unsigned &NumRegsLeft) const {
216 assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
217
219
220 if (Variadic) {
221 return ABIArgInfo::getDirect(/*T=*/nullptr,
222 /*Offset=*/0,
223 /*Padding=*/nullptr,
224 /*CanBeFlattened=*/false,
225 /*Align=*/0);
226 }
227
228 if (isAggregateTypeForABI(Ty)) {
229 // Records with non-trivial destructors/copy-constructors should not be
230 // passed by value.
231 if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
232 return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
234
235 // Ignore empty structs/unions.
236 if (isEmptyRecord(getContext(), Ty, true))
237 return ABIArgInfo::getIgnore();
238
239 // Lower single-element structs to just pass a regular value. TODO: We
240 // could do reasonable-size multiple-element structs too, using getExpand(),
241 // though watch out for things like bitfields.
242 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
243 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
244
245 if (const auto *RD = Ty->getAsRecordDecl();
246 RD && RD->hasFlexibleArrayMember())
248
249 // Pack aggregates <= 8 bytes into single VGPR or pair.
250 uint64_t Size = getContext().getTypeSize(Ty);
251 if (Size <= 64) {
252 unsigned NumRegs = (Size + 31) / 32;
253 NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
254
255 if (Size <= 16)
256 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
257
258 if (Size <= 32)
259 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
260
261 // XXX: Should this be i64 instead, and should the limit increase?
262 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
263 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
264 }
265
266 if (NumRegsLeft > 0) {
267 unsigned NumRegs = numRegsForType(Ty);
268 if (NumRegsLeft >= NumRegs) {
269 NumRegsLeft -= NumRegs;
270 return ABIArgInfo::getDirect();
271 }
272 }
273
274 // Use pass-by-reference in stead of pass-by-value for struct arguments in
275 // function ABI.
277 getContext().getTypeAlignInChars(Ty),
278 getContext().getTargetAddressSpace(LangAS::opencl_private));
279 }
280
281 // Otherwise just do the default thing.
282 ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
283 if (!ArgInfo.isIndirect()) {
284 unsigned NumRegs = numRegsForType(Ty);
285 NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
286 }
287
288 return ArgInfo;
289}
290
291class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
292public:
293 AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
294 : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}
295
296 bool supportsLibCall() const override { return false; }
297 void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
298 CodeGenModule &CGM) const;
299
300 void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
301 CodeGen::CodeGenModule &M) const override;
302 unsigned getDeviceKernelCallingConv() const override;
303
304 llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
305 llvm::PointerType *T, QualType QT) const override;
306
307 LangAS getASTAllocaAddressSpace() const override {
309 getABIInfo().getDataLayout().getAllocaAddrSpace());
310 }
311 LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
312 const VarDecl *D) const override;
313 llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
314 SyncScope Scope,
315 llvm::AtomicOrdering Ordering,
316 llvm::LLVMContext &Ctx) const override;
317 void setTargetAtomicMetadata(CodeGenFunction &CGF,
318 llvm::Instruction &AtomicInst,
319 const AtomicExpr *Expr = nullptr) const override;
320 llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
321 llvm::Function *BlockInvokeFunc,
322 llvm::Type *BlockTy) const override;
323 bool shouldEmitStaticExternCAliases() const override;
324 bool shouldEmitDWARFBitFieldSeparators() const override;
325 void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
326};
327}
328
330 llvm::GlobalValue *GV) {
331 if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
332 return false;
333
334 return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
335 (D->hasAttr<DeviceKernelAttr>() ||
336 (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
337 (isa<VarDecl>(D) &&
338 (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
339 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
340 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
341}
342
343void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
344 const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
345 const auto *ReqdWGS =
346 M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
347 const bool IsOpenCLKernel =
348 M.getLangOpts().OpenCL && FD->hasAttr<DeviceKernelAttr>();
349 const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
350
351 const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
352 if (ReqdWGS || FlatWGS) {
353 M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
354 } else if (IsOpenCLKernel || IsHIPKernel) {
355 // By default, restrict the maximum size to a value specified by
356 // --gpu-max-threads-per-block=n or its default value for HIP.
357 const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
358 const unsigned DefaultMaxWorkGroupSize =
359 IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
360 : M.getLangOpts().GPUMaxThreadsPerBlock;
361 std::string AttrVal =
362 std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
363 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
364 }
365
366 if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
368
369 if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
370 unsigned NumSGPR = Attr->getNumSGPR();
371
372 if (NumSGPR != 0)
373 F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
374 }
375
376 if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
377 uint32_t NumVGPR = Attr->getNumVGPR();
378
379 if (NumVGPR != 0)
380 F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
381 }
382
383 if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
384 uint32_t X = Attr->getMaxNumWorkGroupsX()
385 ->EvaluateKnownConstInt(M.getContext())
386 .getExtValue();
387 // Y and Z dimensions default to 1 if not specified
388 uint32_t Y = Attr->getMaxNumWorkGroupsY()
389 ? Attr->getMaxNumWorkGroupsY()
390 ->EvaluateKnownConstInt(M.getContext())
391 .getExtValue()
392 : 1;
393 uint32_t Z = Attr->getMaxNumWorkGroupsZ()
394 ? Attr->getMaxNumWorkGroupsZ()
395 ->EvaluateKnownConstInt(M.getContext())
396 .getExtValue()
397 : 1;
398
399 llvm::SmallString<32> AttrVal;
400 llvm::raw_svector_ostream OS(AttrVal);
401 OS << X << ',' << Y << ',' << Z;
402
403 F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
404 }
405
406 if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
407 auto GetExprVal = [&](const auto &E) {
408 return E ? E->EvaluateKnownConstInt(M.getContext()).getExtValue() : 1;
409 };
410 unsigned X = GetExprVal(Attr->getX());
411 unsigned Y = GetExprVal(Attr->getY());
412 unsigned Z = GetExprVal(Attr->getZ());
413 llvm::SmallString<32> AttrVal;
414 llvm::raw_svector_ostream OS(AttrVal);
415 OS << X << ',' << Y << ',' << Z;
416 F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
417 }
418
419 // OpenCL doesn't support cluster feature.
420 const TargetInfo &TTI = M.getContext().getTargetInfo();
421 if ((IsOpenCLKernel &&
422 TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) ||
423 FD->hasAttr<CUDANoClusterAttr>())
424 F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
425}
426
427void AMDGPUTargetCodeGenInfo::setTargetAttributes(
428 const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
430 GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
431 GV->setDSOLocal(true);
432 }
433
434 if (GV->isDeclaration())
435 return;
436
437 llvm::Function *F = dyn_cast<llvm::Function>(GV);
438 if (!F)
439 return;
440
441 const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
442 if (FD) {
443 setFunctionDeclAttributes(FD, F, M);
444 if (FD->hasAttr<DeviceKernelAttr>() && !M.getLangOpts().OpenCL)
445 F->setCallingConv(getDeviceKernelCallingConv());
446 }
447 if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
448 F->addFnAttr("amdgpu-ieee", "false");
449}
450
451unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
452 return llvm::CallingConv::AMDGPU_KERNEL;
453}
454
455// Currently LLVM assumes null pointers always have value 0,
456// which results in incorrectly transformed IR. Therefore, instead of
457// emitting null pointers in private and local address spaces, a null
458// pointer in generic address space is emitted which is casted to a
459// pointer in local or private address space.
460llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
461 const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
462 QualType QT) const {
463 if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
464 return llvm::ConstantPointerNull::get(PT);
465
466 auto &Ctx = CGM.getContext();
467 auto NPT = llvm::PointerType::get(
468 PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
469 return llvm::ConstantExpr::getAddrSpaceCast(
470 llvm::ConstantPointerNull::get(NPT), PT);
471}
472
473LangAS
474AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
475 const VarDecl *D) const {
476 assert(!CGM.getLangOpts().OpenCL &&
477 !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
478 "Address space agnostic languages only");
479 LangAS DefaultGlobalAS = getLangASFromTargetAS(
480 CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
481 if (!D)
482 return DefaultGlobalAS;
483
484 LangAS AddrSpace = D->getType().getAddressSpace();
485 if (AddrSpace != LangAS::Default)
486 return AddrSpace;
487
488 // Only promote to address space 4 if VarDecl has constant initialization.
489 if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
491 if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
492 return *ConstAS;
493 }
494 return DefaultGlobalAS;
495}
496
497llvm::SyncScope::ID
498AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
499 SyncScope Scope,
500 llvm::AtomicOrdering Ordering,
501 llvm::LLVMContext &Ctx) const {
502 std::string Name;
503 switch (Scope) {
504 case SyncScope::HIPSingleThread:
505 case SyncScope::SingleScope:
506 Name = "singlethread";
507 break;
508 case SyncScope::HIPWavefront:
509 case SyncScope::OpenCLSubGroup:
510 case SyncScope::WavefrontScope:
511 Name = "wavefront";
512 break;
513 case SyncScope::HIPCluster:
514 case SyncScope::ClusterScope:
515 Name = "cluster";
516 break;
517 case SyncScope::HIPWorkgroup:
518 case SyncScope::OpenCLWorkGroup:
519 case SyncScope::WorkgroupScope:
520 Name = "workgroup";
521 break;
522 case SyncScope::HIPAgent:
523 case SyncScope::OpenCLDevice:
524 case SyncScope::DeviceScope:
525 Name = "agent";
526 break;
527 case SyncScope::SystemScope:
528 case SyncScope::HIPSystem:
529 case SyncScope::OpenCLAllSVMDevices:
530 Name = "";
531 break;
532 }
533
534 // OpenCL assumes by default that atomic scopes are per-address space for
535 // non-sequentially consistent operations.
536 if (Scope >= SyncScope::OpenCLWorkGroup &&
537 Scope <= SyncScope::OpenCLSubGroup &&
538 Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
539 if (!Name.empty())
540 Name = Twine(Twine(Name) + Twine("-")).str();
541
542 Name = Twine(Twine(Name) + Twine("one-as")).str();
543 }
544
545 return Ctx.getOrInsertSyncScopeID(Name);
546}
547
548void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
549 CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
550 const AtomicExpr *AE) const {
551 auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
552 auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);
553
554 // OpenCL and old style HIP atomics consider atomics targeting thread private
555 // memory to be undefined.
556 //
557 // TODO: This is probably undefined for atomic load/store, but there's not
558 // much direct codegen benefit to knowing this.
559 if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
560 (CmpX &&
561 CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
563 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
564 llvm::MDNode *ASRange = MDHelper.createRange(
565 llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
566 llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
567 AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
568 }
569
570 if (!RMW)
571 return;
572
573 AtomicOptions AO = CGF.CGM.getAtomicOpts();
574 llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
576 RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
578 RMW->setMetadata("amdgpu.no.remote.memory", Empty);
580 RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
581 RMW->getType()->isFloatTy())
582 RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
583}
584
585bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
586 return false;
587}
588
589bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
590 return true;
591}
592
593void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
594 const FunctionType *&FT) const {
595 FT = getABIInfo().getContext().adjustFunctionType(
597}
598
599/// Return IR struct type for rtinfo struct in rocm-device-libs used for device
600/// enqueue.
601///
602/// ptr addrspace(1) kernel_object, i32 private_segment_size,
603/// i32 group_segment_size
604
605static llvm::StructType *
606getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
607 llvm::Type *KernelDescriptorPtrTy) {
608 llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
609 return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
610 "block.runtime.handle.t");
611}
612
613/// Create an OpenCL kernel for an enqueued block.
614///
615/// The type of the first argument (the block literal) is the struct type
616/// of the block literal instead of a pointer type. The first argument
617/// (block literal) is passed directly by value to the kernel. The kernel
618/// allocates the same type of struct on stack and stores the block literal
619/// to it and passes its pointer to the block invoke function. The kernel
620/// has "enqueued-block" function attribute and kernel argument metadata.
621llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
622 CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
623 auto &Builder = CGF.Builder;
624 auto &C = CGF.getLLVMContext();
625
626 auto *InvokeFT = Invoke->getFunctionType();
627 llvm::SmallVector<llvm::Type *, 2> ArgTys;
628 llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
629 llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
630 llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
631 llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
632 llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
633 llvm::SmallVector<llvm::Metadata *, 8> ArgNames;
634
635 ArgTys.push_back(BlockTy);
636 ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
637 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
638 ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
639 ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
640 AccessQuals.push_back(llvm::MDString::get(C, "none"));
641 ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
642 for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
643 ArgTys.push_back(InvokeFT->getParamType(I));
644 ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
645 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
646 AccessQuals.push_back(llvm::MDString::get(C, "none"));
647 ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
648 ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
649 ArgNames.push_back(
650 llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
651 }
652
653 llvm::Module &Mod = CGF.CGM.getModule();
654 const llvm::DataLayout &DL = Mod.getDataLayout();
655
656 llvm::Twine Name = Invoke->getName() + "_kernel";
657 auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
658
659 // The kernel itself can be internal, the runtime does not directly access the
660 // kernel address (only the kernel descriptor).
661 auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
662 &Mod);
663 F->setCallingConv(getDeviceKernelCallingConv());
664
665 llvm::AttrBuilder KernelAttrs(C);
666 // FIXME: The invoke isn't applying the right attributes either
667 // FIXME: This is missing setTargetAttributes
669 F->addFnAttrs(KernelAttrs);
670
671 auto IP = CGF.Builder.saveIP();
672 auto *BB = llvm::BasicBlock::Create(C, "entry", F);
673 Builder.SetInsertPoint(BB);
674 const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
675 auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
676 BlockPtr->setAlignment(BlockAlign);
677 Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
678 auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
679 llvm::SmallVector<llvm::Value *, 2> Args;
680 Args.push_back(Cast);
681 for (llvm::Argument &A : llvm::drop_begin(F->args()))
682 Args.push_back(&A);
683 llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
684 call->setCallingConv(Invoke->getCallingConv());
685 Builder.CreateRetVoid();
686 Builder.restoreIP(IP);
687
688 F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
689 F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
690 F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
691 F->setMetadata("kernel_arg_base_type",
692 llvm::MDNode::get(C, ArgBaseTypeNames));
693 F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
694 if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
695 F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
696
697 llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
698 C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
699 llvm::Constant *RuntimeHandleInitializer =
700 llvm::ConstantAggregateZero::get(HandleTy);
701
702 llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";
703
704 // The runtime needs access to the runtime handle as an external symbol. The
705 // runtime handle will need to be made external later, in
706 // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
707 // inside the runtime handle, and is not directly referenced.
708
709 // TODO: We would initialize the first field by declaring F->getName() + ".kd"
710 // to reference the kernel descriptor. The runtime wouldn't need to bother
711 // setting it. We would need to have a final symbol name though.
712 // TODO: Can we directly use an external symbol with getGlobalIdentifier?
713 auto *RuntimeHandle = new llvm::GlobalVariable(
714 Mod, HandleTy,
715 /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
716 /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
717 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
718 DL.getDefaultGlobalsAddressSpace(),
719 /*isExternallyInitialized=*/true);
720
721 llvm::MDNode *HandleAsMD =
722 llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
723 F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);
724
725 RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");
726
727 CGF.CGM.addUsedGlobal(F);
728 CGF.CGM.addUsedGlobal(RuntimeHandle);
729 return RuntimeHandle;
730}
731
733 llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
734 const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
735 int32_t *MaxThreadsVal) {
736 unsigned Min = 0;
737 unsigned Max = 0;
738 auto Eval = [&](Expr *E) {
739 return E->EvaluateKnownConstInt(getContext()).getExtValue();
740 };
741 if (FlatWGS) {
742 Min = Eval(FlatWGS->getMin());
743 Max = Eval(FlatWGS->getMax());
744 }
745 if (ReqdWGS && Min == 0 && Max == 0)
746 Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
747 Eval(ReqdWGS->getZDim());
748
749 if (Min != 0) {
750 assert(Min <= Max && "Min must be less than or equal Max");
751
752 if (MinThreadsVal)
753 *MinThreadsVal = Min;
754 if (MaxThreadsVal)
755 *MaxThreadsVal = Max;
756 std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
757 if (F)
758 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
759 } else
760 assert(Max == 0 && "Max must be zero");
761}
762
764 llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
765 unsigned Min =
766 Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
767 unsigned Max =
768 Attr->getMax()
769 ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
770 : 0;
771
772 if (Min != 0) {
773 assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
774
775 std::string AttrVal = llvm::utostr(Min);
776 if (Max != 0)
777 AttrVal = AttrVal + "," + llvm::utostr(Max);
778 F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
779 } else
780 assert(Max == 0 && "Max must be zero");
781}
782
783std::unique_ptr<TargetCodeGenInfo>
785 return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
786}
static void setCUDAKernelCallingConvention(CanQualType &FTy, CodeGenModule &CGM, const FunctionDecl *FD)
Set calling convention for CUDA/HIP kernel.
Definition CGCall.cpp:359
static bool requiresAMDGPUProtectedVisibility(const Decl *D, llvm::GlobalValue *GV)
Definition AMDGPU.cpp:329
static llvm::StructType * getAMDGPURuntimeHandleType(llvm::LLVMContext &C, llvm::Type *KernelDescriptorPtrTy)
Return IR struct type for rtinfo struct in rocm-device-libs used for device enqueue.
Definition AMDGPU.cpp:606
#define X(type, name)
Definition Value.h:97
uint64_t getTargetNullPointerValue(QualType QT) const
Get target-dependent integer value for null pointer which is used for constant folding.
const TargetInfo & getTargetInfo() const
Definition ASTContext.h:891
unsigned getTargetAddressSpace(LangAS AS) const
bool threadPrivateMemoryAtomicsAreUndefined() const
Return true if atomics operations targeting allocations in private memory are undefined.
Definition Expr.h:6938
Attr - This represents one attribute.
Definition Attr.h:44
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition CharUnits.h:63
static ABIArgInfo getIgnore()
static ABIArgInfo getDirect(llvm::Type *T=nullptr, unsigned Offset=0, llvm::Type *Padding=nullptr, bool CanBeFlattened=true, unsigned Align=0)
static ABIArgInfo getIndirectAliased(CharUnits Alignment, unsigned AddrSpace, bool Realign=false, llvm::Type *Padding=nullptr)
Pass this in memory using the IR byref attribute.
@ RAA_DirectInMemory
Pass it on the stack using its defined layout.
Definition CGCXXABI.h:158
unsigned getCallingConvention() const
getCallingConvention - Return the user specified calling convention, which has been translated into a...
CanQualType getReturnType() const
MutableArrayRef< ArgInfo > arguments()
llvm::LLVMContext & getLLVMContext()
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
void handleAMDGPUWavesPerEUAttr(llvm::Function *F, const AMDGPUWavesPerEUAttr *A)
Emit the IR encoding to attach the AMD GPU waves-per-eu attribute to F.
Definition AMDGPU.cpp:763
const LangOptions & getLangOpts() const
const TargetInfo & getTarget() const
void addUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.used metadata.
void handleAMDGPUFlatWorkGroupSizeAttr(llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *A, const ReqdWorkGroupSizeAttr *ReqdWGS=nullptr, int32_t *MinThreadsVal=nullptr, int32_t *MaxThreadsVal=nullptr)
Emit the IR encoding to attach the AMD GPU flat-work-group-size attribute to F.
Definition AMDGPU.cpp:732
AtomicOptions getAtomicOpts()
Get the current Atomic options.
ASTContext & getContext() const
const CodeGenOptions & getCodeGenOpts() const
void addDefaultFunctionDefinitionAttributes(llvm::AttrBuilder &attrs)
Like the overload taking a Function &, but intended specifically for frontends that want to build on ...
Definition CGCall.cpp:2228
DefaultABIInfo - The default implementation for ABI specific details.
Definition ABIInfoImpl.h:21
ABIArgInfo classifyArgumentType(QualType RetTy) const
ABIArgInfo classifyReturnType(QualType RetTy) const
Decl - This represents one declaration (or definition), e.g.
Definition DeclBase.h:86
T * getAttr() const
Definition DeclBase.h:573
bool hasAttr() const
Definition DeclBase.h:577
This represents one expression.
Definition Expr.h:112
ExtInfo withCallingConv(CallingConv cc) const
Definition TypeBase.h:4673
ExtInfo getExtInfo() const
Definition TypeBase.h:4806
A (possibly-)qualified type.
Definition TypeBase.h:937
LangAS getAddressSpace() const
Return the address space of this type.
Definition TypeBase.h:8404
bool isConstantStorage(const ASTContext &Ctx, bool ExcludeCtor, bool ExcludeDtor)
Definition TypeBase.h:1036
bool hasFlexibleArrayMember() const
Definition Decl.h:4345
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition TargetInfo.h:323
virtual std::optional< LangAS > getConstantAddressSpace() const
Return an AST address space which can be used opportunistically for constant global memory.
virtual bool hasFeatureEnabled(const llvm::StringMap< bool > &Features, StringRef Name) const
Check if target has a given feature enabled.
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
RecordDecl * getAsRecordDecl() const
Retrieves the RecordDecl this type refers to.
Definition Type.h:41
const T * getAs() const
Member-template getAs<specific type>.
Definition TypeBase.h:9091
QualType getType() const
Definition Decl.h:723
bool hasConstantInitialization() const
Determine whether this variable has constant initialization.
Definition Decl.cpp:2648
ABIArgInfo classifyArgumentType(CodeGenModule &CGM, CanQualType type)
Classify the rules for how to pass a particular type.
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
Definition CGValue.h:145
CGCXXABI::RecordArgABI getRecordArgABI(const RecordType *RT, CGCXXABI &CXXABI)
bool classifyReturnType(const CGCXXABI &CXXABI, CGFunctionInfo &FI, const ABIInfo &Info)
std::unique_ptr< TargetCodeGenInfo > createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM)
Definition AMDGPU.cpp:784
RValue emitVoidPtrVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType ValueTy, bool IsIndirect, TypeInfoChars ValueInfo, CharUnits SlotSizeAndAlign, bool AllowHigherAlign, AggValueSlot Slot, bool ForceRightAdjust=false)
Emit va_arg for a platform using the common void* representation, where arguments are simply emitted ...
bool isAggregateTypeForABI(QualType T)
const Type * isSingleElementStruct(QualType T, ASTContext &Context)
isSingleElementStruct - Determine if a structure is a "single-element struct", i.e.
QualType useFirstFieldIfTransparentUnion(QualType Ty)
Pass transparent unions as if they were the type of the first element.
bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, bool AsIfNoUniqueAddr=false)
isEmptyRecord - Return true iff a structure contains only empty fields.
@ OS
Indicates that the tracking object is a descendant of a reference-counted OSObject,...
bool Cast(InterpState &S, CodePtr OpPC)
Definition Interp.h:2476
The JSON file list parser is used to communicate input to InstallAPI.
bool isa(CodeGen::Address addr)
Definition Address.h:330
nullptr
This class represents a compute construct, representing a 'Kind' of 'parallel', 'serial',...
const FunctionProtoType * T
@ Type
The name was classified as a type.
Definition Sema.h:562
LangAS
Defines the address space values used by the address space qualifier of QualType.
SyncScope
Defines sync scope values used internally by clang.
Definition SyncScope.h:42
@ CC_DeviceKernel
Definition Specifiers.h:292
U cast(CodeGen::Address addr)
Definition Address.h:327
LangAS getLangASFromTargetAS(unsigned TargetAS)
unsigned long uint64_t
unsigned int uint32_t
bool getOption(AtomicOptionKind Kind) const