clang 23.0.0git
CIRGenCUDANV.cpp
Go to the documentation of this file.
//========- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----=========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//
13
#include "CIRGenCUDARuntime.h"
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
#include "mlir/IR/Operation.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/AddressSpaces.h"
#include "clang/Basic/Cuda.h"
#include "llvm/Support/Casting.h"
26
27using namespace clang;
28using namespace clang::CIRGen;
29
30namespace {
31
32class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
33protected:
34 StringRef prefix;
35
36 // Map a device stub function to a symbol for identifying kernel in host
37 // code. For CUDA, the symbol for identifying the kernel is the same as the
38 // device stub function. For HIP, they are different.
39 llvm::StringMap<mlir::Operation *> kernelHandles;
40
41 // Map a kernel handle to the kernel stub.
42 llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
43 // Mangle context for device.
44 std::unique_ptr<MangleContext> deviceMC;
45
46private:
47 void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
48 FunctionArgList &args);
49 mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
50 FunctionArgList &args);
51 mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
52
53 mlir::Operation *getKernelStub(mlir::Operation *handle) override {
54 auto it = kernelStubs.find(handle);
55 assert(it != kernelStubs.end());
56 return it->second;
57 }
58 std::string addPrefixToName(StringRef funcName) const;
59 std::string addUnderscoredPrefixToName(StringRef funcName) const;
60
61public:
62 CIRGenNVCUDARuntime(CIRGenModule &cgm);
63 ~CIRGenNVCUDARuntime();
64
65 void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
66 FunctionArgList &args) override;
67};
68
69} // namespace
70
71std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
72 return (prefix + funcName).str();
73}
74
75std::string
76CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
77 return ("__" + prefix + funcName).str();
78}
79
80CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
81 : CIRGenCUDARuntime(cgm),
82 deviceMC(cgm.getASTContext().cudaNVInitDeviceMC()) {
83 if (cgm.getLangOpts().OffloadViaLLVM)
84 cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
85 else if (cgm.getLangOpts().HIP)
86 prefix = "hip";
87 else
88 prefix = "cuda";
89}
90
91mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
92 mlir::Location loc,
93 FunctionArgList &args) {
94 CIRGenBuilderTy &builder = cgm.getBuilder();
95
96 // Build void *args[] and populate with the addresses of kernel arguments.
97 auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
98 mlir::Value kernelArgs = builder.createAlloca(
99 loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
101
102 mlir::Value kernelArgsDecayed =
103 builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
104 cir::PointerType::get(cgm.voidPtrTy));
105
106 for (const auto &[i, arg] : llvm::enumerate(args)) {
107 mlir::Value index =
108 builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
109 mlir::Value storePos =
110 builder.createPtrStride(loc, kernelArgsDecayed, index);
111 mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
112 mlir::Value argAsVoid = builder.createBitcast(argAddr, cgm.voidPtrTy);
113
114 builder.CIRBaseBuilderTy::createStore(loc, argAsVoid, storePos);
115 }
116
117 return kernelArgsDecayed;
118}
119
120// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
121// array and kernels are launched using cudaLaunchKernel().
122void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
123 cir::FuncOp fn,
124 FunctionArgList &args) {
125
126 // This requires arguments to be sent to kernels in a different way.
127 if (cgm.getLangOpts().OffloadViaLLVM)
128 cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
129
130 CIRGenBuilderTy &builder = cgm.getBuilder();
131 mlir::Location loc = fn.getLoc();
132
133 // For [cuda|hip]LaunchKernel, we must add another layer of indirection
134 // to arguments. For example, for function `add(int a, float b)`,
135 // we need to pass it as `void *args[2] = { &a, &b }`.
136 mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);
137
138 // Lookup cudaLaunchKernel/hipLaunchKernel function.
139 // HIP kernel launching API name depends on -fgpu-default-stream option. For
140 // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
141 // it is hipLaunchKernel_spt.
142 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
143 // void **args, size_t sharedMem,
144 // cudaStream_t stream);
145 // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
146 // dim3 blockDim, void **args,
147 // size_t sharedMem, hipStream_t stream);
148 TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
149 DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);
150
151 // The default stream is usually stream 0 (the legacy default stream).
152 // For per-thread default stream, we need a different LaunchKernel function.
153 StringRef kernelLaunchAPI = "LaunchKernel";
154 if (cgm.getLangOpts().GPUDefaultStream ==
155 LangOptions::GPUDefaultStreamKind::PerThread)
156 cgm.errorNYI("CUDA/HIP Stream per thread");
157
158 std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
159 const IdentifierInfo &launchII =
160 cgm.getASTContext().Idents.get(launchKernelName);
161 FunctionDecl *cudaLaunchKernelFD = nullptr;
162 for (NamedDecl *result : dc->lookup(&launchII)) {
163 if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
164 cudaLaunchKernelFD = fd;
165 }
166
167 if (cudaLaunchKernelFD == nullptr) {
168 cgm.error(cgf.curFuncDecl->getLocation(),
169 "Can't find declaration for " + launchKernelName);
170 return;
171 }
172
173 // Use this function to retrieve arguments for cudaLaunchKernel:
174 // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
175 // *sharedMem, cudaStream_t *stream)
176 //
177 // Here [cuda|hip]Stream_t, while also being the 6th argument of
178 // [cuda|hip]LaunchKernel, is a pointer to some opaque struct.
179
180 mlir::Type dim3Ty = cgf.getTypes().convertType(
181 cudaLaunchKernelFD->getParamDecl(1)->getType());
182 mlir::Type streamTy = cgf.getTypes().convertType(
183 cudaLaunchKernelFD->getParamDecl(5)->getType());
184
185 mlir::Value gridDim =
186 builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
187 "grid_dim", CharUnits::fromQuantity(8));
188 mlir::Value blockDim =
189 builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
190 "block_dim", CharUnits::fromQuantity(8));
191 mlir::Value sharedMem =
192 builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
193 "shared_mem", cgm.getSizeAlign());
194 mlir::Value stream =
195 builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
196 "stream", cgm.getPointerAlign());
197
198 cir::FuncOp popConfig = cgm.createRuntimeFunction(
199 cir::FuncType::get({gridDim.getType(), blockDim.getType(),
200 sharedMem.getType(), stream.getType()},
201 cgm.sInt32Ty),
202 addUnderscoredPrefixToName("PopCallConfiguration"));
203 cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
204
205 // Now emit the call to cudaLaunchKernel
206 // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
207 // dim3 blockDim,
208 // void **args, size_t sharedMem,
209 // [cuda|hip]Stream_t stream);
210
211 // We now either pick the function or the stub global for cuda, hip
212 // respectively.
213 mlir::Value kernel = [&]() -> mlir::Value {
214 if (cir::GlobalOp globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
215 kernelHandles[fn.getSymName()])) {
216 cir::PointerType kernelTy = cir::PointerType::get(globalOp.getSymType());
217 mlir::Value kernelVal = cir::GetGlobalOp::create(builder, loc, kernelTy,
218 globalOp.getSymName());
219 mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
220 return func;
221 }
222 if (cir::FuncOp funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
223 kernelHandles[fn.getSymName()])) {
224 cir::PointerType kernelTy =
225 cir::PointerType::get(funcOp.getFunctionType());
226 mlir::Value kernelVal =
227 cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
228 mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
229 return func;
230 }
231 llvm_unreachable("Expected stub handle to be cir::GlobalOp or FuncOp");
232 }();
233
234 CallArgList launchArgs;
235 launchArgs.add(RValue::get(kernel),
236 cudaLaunchKernelFD->getParamDecl(0)->getType());
237 launchArgs.add(
239 cudaLaunchKernelFD->getParamDecl(1)->getType());
240 launchArgs.add(
242 cudaLaunchKernelFD->getParamDecl(2)->getType());
243 launchArgs.add(RValue::get(kernelArgs),
244 cudaLaunchKernelFD->getParamDecl(3)->getType());
245 launchArgs.add(
246 RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
247 cudaLaunchKernelFD->getParamDecl(4)->getType());
248 launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
249 cudaLaunchKernelFD->getParamDecl(5)->getType());
250
251 mlir::Type launchTy =
252 cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
253 mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
254 cast<cir::FuncType>(launchTy), launchKernelName);
255 const CIRGenFunctionInfo &callInfo =
256 cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
257 cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
258 ReturnValueSlot(), launchArgs);
259
261 !cgf.getLangOpts().HIP)
262 cgm.errorNYI("MSVC CUDA stub handling");
263}
264
265void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
266 FunctionArgList &args) {
267
268 if (auto globalOp =
269 llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
270 CIRGenBuilderTy &builder = cgm.getBuilder();
271 mlir::Type fnPtrTy = globalOp.getSymType();
272 auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
273 auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
274
275 globalOp->setAttr("initial_value", gv);
276 globalOp->removeAttr("sym_visibility");
277 globalOp->setAttr("alignment", builder.getI64IntegerAttr(
279 }
280
281 // CUDA 9.0 changed the way to launch kernels.
283 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
284 (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
285 cgm.getLangOpts().OffloadViaLLVM)
286 emitDeviceStubBodyNew(cgf, fn, args);
287 else
288 cgm.errorNYI("Emit Stub Body Legacy");
289}
290
292 return new CIRGenNVCUDARuntime(cgm);
293}
294
295CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
296
297mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
298 GlobalDecl gd) {
299
300 // Check if we already have a kernel handle for this function
301 auto it = kernelHandles.find(fn.getSymName());
302 if (it != kernelHandles.end()) {
303 mlir::Operation *oldHandle = it->second;
304 // Here we know that the fn did not change. Return it
305 if (kernelStubs[oldHandle] == fn)
306 return oldHandle;
307
308 // We've found the function name, but F itself has changed, so we need to
309 // update the references.
310 if (cgm.getLangOpts().HIP) {
311 // For HIP compilation the handle itself does not change, so we only need
312 // to update the Stub value.
313 kernelStubs[oldHandle] = fn;
314 return oldHandle;
315 }
316 // For non-HIP compilation, erase the old Stub and fall-through to creating
317 // new entries.
318 kernelStubs.erase(oldHandle);
319 }
320
321 // If not targeting HIP, store the function itself
322 if (!cgm.getLangOpts().HIP) {
323 kernelHandles[fn.getSymName()] = fn;
324 kernelStubs[fn] = fn;
325 return fn;
326 }
327
328 // Create a new CIR global variable to represent the kernel handle
329 CIRGenBuilderTy &builder = cgm.getBuilder();
330 StringRef globalName = cgm.getMangledName(
331 gd.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
332 cir::GlobalOp globalOp = CIRGenModule::createGlobalOp(
333 cgm, fn.getLoc(), globalName, fn.getFunctionType(),
334 /*isConstant=*/true);
335
336 globalOp->setAttr("alignment", builder.getI64IntegerAttr(
338
339 // Store references
340 kernelHandles[fn.getSymName()] = globalOp;
341 kernelStubs[globalOp] = fn;
342
343 return globalOp;
344}
Defines the clang::ASTContext interface.
Provides definitions for the various language-specific address spaces.
__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim
__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim
mlir::Value createCast(mlir::Location loc, cir::CastKind kind, mlir::Value src, mlir::Type newTy)
cir::PtrStrideOp createPtrStride(mlir::Location loc, mlir::Value base, mlir::Value stride)
mlir::Value createBitcast(mlir::Value src, mlir::Type newTy)
mlir::Value createAlloca(mlir::Location loc, cir::PointerType addrType, mlir::Type type, llvm::StringRef name, mlir::IntegerAttr alignment, mlir::Value dynAllocSize)
TranslationUnitDecl * getTranslationUnitDecl() const
IdentifierTable & Idents
Definition ASTContext.h:797
const TargetInfo & getTargetInfo() const
Definition ASTContext.h:916
mlir::Value getPointer() const
Definition Address.h:96
cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal)
static CIRGenCallee forDirect(mlir::Operation *funcPtr, const CIRGenCalleeInfo &abstractInfo=CIRGenCalleeInfo())
Definition CIRGenCall.h:92
CIRGenTypes & getTypes() const
const clang::LangOptions & getLangOpts() const
const clang::Decl * curFuncDecl
Address getAddrOfLocalVar(const clang::VarDecl *vd)
Return the address of a local variable.
RValue emitCall(const CIRGenFunctionInfo &funcInfo, const CIRGenCallee &callee, ReturnValueSlot returnValue, const CallArgList &args, cir::CIRCallOpInterface *callOp, mlir::Location loc)
mlir::Value emitRuntimeCall(mlir::Location loc, cir::FuncOp callee, llvm::ArrayRef< mlir::Value > args={})
This class organizes the cross-function state that is used while generating CIR code.
llvm::StringRef getMangledName(clang::GlobalDecl gd)
DiagnosticBuilder errorNYI(SourceLocation, llvm::StringRef)
Helpers to emit "not yet implemented" error diagnostics.
clang::ASTContext & getASTContext() const
cir::FuncOp createRuntimeFunction(cir::FuncType ty, llvm::StringRef name, mlir::ArrayAttr={}, bool isLocal=false, bool assumeConvergent=false)
CIRGenBuilderTy & getBuilder()
const clang::TargetInfo & getTarget() const
void error(SourceLocation loc, llvm::StringRef error)
Emit a general error that something can't be done.
const clang::LangOptions & getLangOpts() const
static cir::GlobalOp createGlobalOp(CIRGenModule &cgm, mlir::Location loc, llvm::StringRef name, mlir::Type t, bool isConstant=false, mlir::Operation *insertPoint=nullptr)
const CIRGenFunctionInfo & arrangeFunctionDeclaration(const clang::FunctionDecl *fd)
Free functions are functions that are compatible with an ordinary C function pointer type.
mlir::Type convertType(clang::QualType type)
Convert a Clang type into a mlir::Type.
void add(RValue rvalue, clang::QualType type)
Definition CIRGenCall.h:235
Type for representing both the decl and type of parameters to a function.
Definition CIRGenCall.h:193
static RValue get(mlir::Value v)
Definition CIRGenValue.h:83
static RValue getAggregate(Address addr, bool isVolatile=false)
Convert an Address to an RValue.
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition CharUnits.h:185
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition CharUnits.h:63
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
SourceLocation getLocation() const
Definition DeclBase.h:439
const ParmVarDecl * getParamDecl(unsigned i) const
Definition Decl.h:2797
GlobalDecl - represents a global declaration.
Definition GlobalDecl.h:57
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
Definition GlobalDecl.h:203
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
const llvm::VersionTuple & getSDKVersion() const
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
Definition Decl.h:151
QualType getType() const
Definition Decl.h:723
CIRGenCUDARuntime * createNVCUDARuntime(CIRGenModule &cgm)
The JSON file list parser is used to communicate input to InstallAPI.
if(T->getSizeExpr()) TRY_TO(TraverseStmt(const_cast< Expr * >(T -> getSizeExpr())))
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Definition Cuda.cpp:163
U cast(CodeGen::Address addr)
Definition Address.h:327
clang::CharUnits getPointerAlign() const
clang::CharUnits getSizeAlign() const
cir::PointerType voidPtrTy
void* in address space 0