clang 23.0.0git
CIRGenCUDANV.cpp
Go to the documentation of this file.
//========- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----=========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CIRGenCUDARuntime.h"
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
#include "mlir/IR/Operation.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/AST/GlobalDecl.h"
#include "clang/Basic/AddressSpaces.h"
#include "clang/Basic/Cuda.h"
#include "llvm/Support/Casting.h"

using namespace clang;
using namespace clang::CIRGen;

30namespace {
31
32class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
33protected:
34 StringRef prefix;
35
36 // Map a device stub function to a symbol for identifying kernel in host
37 // code. For CUDA, the symbol for identifying the kernel is the same as the
38 // device stub function. For HIP, they are different.
39 llvm::StringMap<mlir::Operation *> kernelHandles;
40
41 // Map a kernel handle to the kernel stub.
42 llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
43 // Mangle context for device.
44 std::unique_ptr<MangleContext> deviceMC;
45
46private:
47 void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
48 FunctionArgList &args);
49 mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
50 FunctionArgList &args);
51 mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
52
53 mlir::Operation *getKernelStub(mlir::Operation *handle) override {
54 auto it = kernelStubs.find(handle);
55 assert(it != kernelStubs.end());
56 return it->second;
57 }
58 std::string addPrefixToName(StringRef funcName) const;
59 std::string addUnderscoredPrefixToName(StringRef funcName) const;
60
61public:
62 CIRGenNVCUDARuntime(CIRGenModule &cgm);
63 ~CIRGenNVCUDARuntime();
64
65 void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
66 FunctionArgList &args) override;
67};
68
69} // namespace
70
71std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
72 return (prefix + funcName).str();
73}
74
75std::string
76CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
77 return ("__" + prefix + funcName).str();
78}
79
80CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
81 : CIRGenCUDARuntime(cgm),
82 deviceMC(cgm.getASTContext().cudaNVInitDeviceMC()) {
83 if (cgm.getLangOpts().OffloadViaLLVM)
84 cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
85 else if (cgm.getLangOpts().HIP)
86 prefix = "hip";
87 else
88 prefix = "cuda";
89}
90
91mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
92 mlir::Location loc,
93 FunctionArgList &args) {
94 CIRGenBuilderTy &builder = cgm.getBuilder();
95
96 // Build void *args[] and populate with the addresses of kernel arguments.
97 auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
98 mlir::Value kernelArgs = builder.createAlloca(
99 loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
101
102 mlir::Value kernelArgsDecayed =
103 builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
104 cir::PointerType::get(cgm.voidPtrTy));
105
106 for (const auto &[i, arg] : llvm::enumerate(args)) {
107 mlir::Value index =
108 builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
109 mlir::Value storePos =
110 builder.createPtrStride(loc, kernelArgsDecayed, index);
111 mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
112 mlir::Value argAsVoid = builder.createBitcast(argAddr, cgm.voidPtrTy);
113
114 builder.CIRBaseBuilderTy::createStore(loc, argAsVoid, storePos);
115 }
116
117 return kernelArgsDecayed;
118}
119
120// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
121// array and kernels are launched using cudaLaunchKernel().
122void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
123 cir::FuncOp fn,
124 FunctionArgList &args) {
125
126 // This requires arguments to be sent to kernels in a different way.
127 if (cgm.getLangOpts().OffloadViaLLVM)
128 cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
129
130 CIRGenBuilderTy &builder = cgm.getBuilder();
131 mlir::Location loc = fn.getLoc();
132
133 // For [cuda|hip]LaunchKernel, we must add another layer of indirection
134 // to arguments. For example, for function `add(int a, float b)`,
135 // we need to pass it as `void *args[2] = { &a, &b }`.
136 mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);
137
138 // Lookup cudaLaunchKernel/hipLaunchKernel function.
139 // HIP kernel launching API name depends on -fgpu-default-stream option. For
140 // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
141 // it is hipLaunchKernel_spt.
142 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
143 // void **args, size_t sharedMem,
144 // cudaStream_t stream);
145 // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
146 // dim3 blockDim, void **args,
147 // size_t sharedMem, hipStream_t stream);
148 TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
149 DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);
150
151 // The default stream is usually stream 0 (the legacy default stream).
152 // For per-thread default stream, we need a different LaunchKernel function.
153 std::string kernelLaunchAPI = "LaunchKernel";
154 if (cgm.getLangOpts().GPUDefaultStream ==
155 LangOptions::GPUDefaultStreamKind::PerThread) {
156 if (cgm.getLangOpts().HIP)
157 kernelLaunchAPI += "_spt";
158 else if (cgm.getLangOpts().CUDA)
159 kernelLaunchAPI += "_ptsz";
160 }
161
162 std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
163 const IdentifierInfo &launchII =
164 cgm.getASTContext().Idents.get(launchKernelName);
165 FunctionDecl *cudaLaunchKernelFD = nullptr;
166 for (NamedDecl *result : dc->lookup(&launchII)) {
167 if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
168 cudaLaunchKernelFD = fd;
169 }
170
171 if (cudaLaunchKernelFD == nullptr) {
172 cgm.error(cgf.curFuncDecl->getLocation(),
173 "Can't find declaration for " + launchKernelName);
174 return;
175 }
176
177 // Use this function to retrieve arguments for cudaLaunchKernel:
178 // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
179 // *sharedMem, cudaStream_t *stream)
180 //
181 // Here [cuda|hip]Stream_t, while also being the 6th argument of
182 // [cuda|hip]LaunchKernel, is a pointer to some opaque struct.
183
184 mlir::Type dim3Ty = cgf.getTypes().convertType(
185 cudaLaunchKernelFD->getParamDecl(1)->getType());
186 mlir::Type streamTy = cgf.getTypes().convertType(
187 cudaLaunchKernelFD->getParamDecl(5)->getType());
188
189 mlir::Value gridDim =
190 builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
191 "grid_dim", CharUnits::fromQuantity(8));
192 mlir::Value blockDim =
193 builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
194 "block_dim", CharUnits::fromQuantity(8));
195 mlir::Value sharedMem =
196 builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
197 "shared_mem", cgm.getSizeAlign());
198 mlir::Value stream =
199 builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
200 "stream", cgm.getPointerAlign());
201
202 cir::FuncOp popConfig = cgm.createRuntimeFunction(
203 cir::FuncType::get({gridDim.getType(), blockDim.getType(),
204 sharedMem.getType(), stream.getType()},
205 cgm.sInt32Ty),
206 addUnderscoredPrefixToName("PopCallConfiguration"));
207 cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
208
209 // Now emit the call to cudaLaunchKernel
210 // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
211 // dim3 blockDim,
212 // void **args, size_t sharedMem,
213 // [cuda|hip]Stream_t stream);
214
215 // We now either pick the function or the stub global for cuda, hip
216 // respectively.
217 mlir::Value kernel = [&]() -> mlir::Value {
218 if (cir::GlobalOp globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
219 kernelHandles[fn.getSymName()])) {
220 cir::PointerType kernelTy = cir::PointerType::get(globalOp.getSymType());
221 mlir::Value kernelVal = cir::GetGlobalOp::create(builder, loc, kernelTy,
222 globalOp.getSymName());
223 mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
224 return func;
225 }
226 if (cir::FuncOp funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
227 kernelHandles[fn.getSymName()])) {
228 cir::PointerType kernelTy =
229 cir::PointerType::get(funcOp.getFunctionType());
230 mlir::Value kernelVal =
231 cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
232 mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
233 return func;
234 }
235 llvm_unreachable("Expected stub handle to be cir::GlobalOp or FuncOp");
236 }();
237
238 CallArgList launchArgs;
239 launchArgs.add(RValue::get(kernel),
240 cudaLaunchKernelFD->getParamDecl(0)->getType());
241 launchArgs.add(
243 cudaLaunchKernelFD->getParamDecl(1)->getType());
244 launchArgs.add(
246 cudaLaunchKernelFD->getParamDecl(2)->getType());
247 launchArgs.add(RValue::get(kernelArgs),
248 cudaLaunchKernelFD->getParamDecl(3)->getType());
249 launchArgs.add(
250 RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
251 cudaLaunchKernelFD->getParamDecl(4)->getType());
252 launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
253 cudaLaunchKernelFD->getParamDecl(5)->getType());
254
255 mlir::Type launchTy =
256 cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
257 mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
258 cast<cir::FuncType>(launchTy), launchKernelName);
259 const CIRGenFunctionInfo &callInfo =
260 cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
261 cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
262 ReturnValueSlot(), launchArgs);
263
265 !cgf.getLangOpts().HIP)
266 cgm.errorNYI("MSVC CUDA stub handling");
267}
268
269void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
270 FunctionArgList &args) {
271
272 if (auto globalOp =
273 llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
274 CIRGenBuilderTy &builder = cgm.getBuilder();
275 mlir::Type fnPtrTy = globalOp.getSymType();
276 auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
277 auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
278
279 globalOp->setAttr("initial_value", gv);
280 globalOp->removeAttr("sym_visibility");
281 globalOp->setAttr("alignment", builder.getI64IntegerAttr(
283 }
284
285 // CUDA 9.0 changed the way to launch kernels.
287 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
288 (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
289 cgm.getLangOpts().OffloadViaLLVM)
290 emitDeviceStubBodyNew(cgf, fn, args);
291 else
292 cgm.errorNYI("Emit Stub Body Legacy");
293}
294
296 return new CIRGenNVCUDARuntime(cgm);
297}
298
299CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
300
301mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
302 GlobalDecl gd) {
303
304 // Check if we already have a kernel handle for this function
305 auto it = kernelHandles.find(fn.getSymName());
306 if (it != kernelHandles.end()) {
307 mlir::Operation *oldHandle = it->second;
308 // Here we know that the fn did not change. Return it
309 if (kernelStubs[oldHandle] == fn)
310 return oldHandle;
311
312 // We've found the function name, but F itself has changed, so we need to
313 // update the references.
314 if (cgm.getLangOpts().HIP) {
315 // For HIP compilation the handle itself does not change, so we only need
316 // to update the Stub value.
317 kernelStubs[oldHandle] = fn;
318 return oldHandle;
319 }
320 // For non-HIP compilation, erase the old Stub and fall-through to creating
321 // new entries.
322 kernelStubs.erase(oldHandle);
323 }
324
325 // If not targeting HIP, store the function itself
326 if (!cgm.getLangOpts().HIP) {
327 kernelHandles[fn.getSymName()] = fn;
328 kernelStubs[fn] = fn;
329 return fn;
330 }
331
332 // Create a new CIR global variable to represent the kernel handle
333 CIRGenBuilderTy &builder = cgm.getBuilder();
334 StringRef globalName = cgm.getMangledName(
335 gd.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
336 cir::GlobalOp globalOp = CIRGenModule::createGlobalOp(
337 cgm, fn.getLoc(), globalName, fn.getFunctionType(),
338 /*isConstant=*/true);
339
340 globalOp->setAttr("alignment", builder.getI64IntegerAttr(
342
343 // Store references
344 kernelHandles[fn.getSymName()] = globalOp;
345 kernelStubs[globalOp] = fn;
346
347 return globalOp;
348}
Defines the clang::ASTContext interface.
Provides definitions for the various language-specific address spaces.
__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim
__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim
mlir::Value createCast(mlir::Location loc, cir::CastKind kind, mlir::Value src, mlir::Type newTy)
cir::PtrStrideOp createPtrStride(mlir::Location loc, mlir::Value base, mlir::Value stride)
mlir::Value createBitcast(mlir::Value src, mlir::Type newTy)
mlir::Value createAlloca(mlir::Location loc, cir::PointerType addrType, mlir::Type type, llvm::StringRef name, mlir::IntegerAttr alignment, mlir::Value dynAllocSize)
TranslationUnitDecl * getTranslationUnitDecl() const
IdentifierTable & Idents
Definition ASTContext.h:798
const TargetInfo & getTargetInfo() const
Definition ASTContext.h:917
mlir::Value getPointer() const
Definition Address.h:96
cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal)
static CIRGenCallee forDirect(mlir::Operation *funcPtr, const CIRGenCalleeInfo &abstractInfo=CIRGenCalleeInfo())
Definition CIRGenCall.h:92
CIRGenTypes & getTypes() const
const clang::LangOptions & getLangOpts() const
const clang::Decl * curFuncDecl
Address getAddrOfLocalVar(const clang::VarDecl *vd)
Return the address of a local variable.
RValue emitCall(const CIRGenFunctionInfo &funcInfo, const CIRGenCallee &callee, ReturnValueSlot returnValue, const CallArgList &args, cir::CIRCallOpInterface *callOp, mlir::Location loc)
mlir::Value emitRuntimeCall(mlir::Location loc, cir::FuncOp callee, llvm::ArrayRef< mlir::Value > args={}, mlir::NamedAttrList attrs={})
This class organizes the cross-function state that is used while generating CIR code.
static cir::GlobalOp createGlobalOp(CIRGenModule &cgm, mlir::Location loc, llvm::StringRef name, mlir::Type t, bool isConstant=false, mlir::ptr::MemorySpaceAttrInterface addrSpace={}, mlir::Operation *insertPoint=nullptr)
llvm::StringRef getMangledName(clang::GlobalDecl gd)
DiagnosticBuilder errorNYI(SourceLocation, llvm::StringRef)
Helpers to emit "not yet implemented" error diagnostics.
clang::ASTContext & getASTContext() const
CIRGenBuilderTy & getBuilder()
const clang::TargetInfo & getTarget() const
void error(SourceLocation loc, llvm::StringRef error)
Emit a general error that something can't be done.
cir::FuncOp createRuntimeFunction(cir::FuncType ty, llvm::StringRef name, mlir::NamedAttrList extraAttrs={}, bool isLocal=false, bool assumeConvergent=false)
const clang::LangOptions & getLangOpts() const
const CIRGenFunctionInfo & arrangeFunctionDeclaration(const clang::FunctionDecl *fd)
Free functions are functions that are compatible with an ordinary C function pointer type.
mlir::Type convertType(clang::QualType type)
Convert a Clang type into a mlir::Type.
void add(RValue rvalue, clang::QualType type)
Definition CIRGenCall.h:239
Type for representing both the decl and type of parameters to a function.
Definition CIRGenCall.h:193
static RValue get(mlir::Value v)
Definition CIRGenValue.h:83
static RValue getAggregate(Address addr, bool isVolatile=false)
Convert an Address to an RValue.
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition CharUnits.h:185
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition CharUnits.h:63
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
SourceLocation getLocation() const
Definition DeclBase.h:447
const ParmVarDecl * getParamDecl(unsigned i) const
Definition Decl.h:2812
GlobalDecl - represents a global declaration.
Definition GlobalDecl.h:57
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
Definition GlobalDecl.h:203
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
const llvm::VersionTuple & getSDKVersion() const
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
Definition Decl.h:151
QualType getType() const
Definition Decl.h:723
CIRGenCUDARuntime * createNVCUDARuntime(CIRGenModule &cgm)
@ Address
A pointer to a ValueDecl.
Definition Primitives.h:28
The JSON file list parser is used to communicate input to InstallAPI.
if(T->getSizeExpr()) TRY_TO(TraverseStmt(const_cast< Expr * >(T -> getSizeExpr())))
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Definition Cuda.cpp:163
U cast(CodeGen::Address addr)
Definition Address.h:327
clang::CharUnits getPointerAlign() const
clang::CharUnits getSizeAlign() const
cir::PointerType voidPtrTy
void* in address space 0