//===-------- NVPTX.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {
// Helper classes for mapping MMA builtins to particular LLVM intrinsic variant.
struct NVPTXMmaLdstInfo {
  unsigned NumResults; // Number of elements to load/store
  // Intrinsic IDs for row/col variants. 0 if particular layout is unsupported.
  unsigned IID_col;
  unsigned IID_row;
};

#define MMA_INTR(geom_op_type, layout) \
  Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
#define MMA_LDST(n, geom_op_type) \
  { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }

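// Illustration (added comment, not in the upstream file): for a builtin such
// as __hmma_m16n16k16_ld_a, MMA_LDST(8, m16n16k16_load_a_f16) expands to
//   {8, Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride,
//    Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride}
// i.e. an 8-element fragment plus one intrinsic per supported layout.
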
static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
  switch (BuiltinID) {
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
    return MMA_LDST(8, m16n16k16_load_a_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_b:
    return MMA_LDST(8, m16n16k16_load_b_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
    return MMA_LDST(4, m16n16k16_load_c_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
    return MMA_LDST(8, m16n16k16_load_c_f32);
  case NVPTX::BI__hmma_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_b:
    return MMA_LDST(8, m32n8k16_load_b_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
    return MMA_LDST(4, m32n8k16_load_c_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
    return MMA_LDST(8, m32n8k16_load_c_f32);
  case NVPTX::BI__hmma_m8n32k16_ld_a:
    return MMA_LDST(8, m8n32k16_load_a_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
    return MMA_LDST(4, m8n32k16_load_c_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
    return MMA_LDST(8, m8n32k16_load_c_f32);

  // Integer MMA loads
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
    return MMA_LDST(2, m16n16k16_load_a_s8);
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
    return MMA_LDST(2, m16n16k16_load_a_u8);
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
    return MMA_LDST(2, m16n16k16_load_b_s8);
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
    return MMA_LDST(2, m16n16k16_load_b_u8);
  case NVPTX::BI__imma_m16n16k16_ld_c:
    return MMA_LDST(8, m16n16k16_load_c_s32);
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
    return MMA_LDST(4, m32n8k16_load_a_s8);
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
    return MMA_LDST(4, m32n8k16_load_a_u8);
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
    return MMA_LDST(1, m32n8k16_load_b_s8);
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
    return MMA_LDST(1, m32n8k16_load_b_u8);
  case NVPTX::BI__imma_m32n8k16_ld_c:
    return MMA_LDST(8, m32n8k16_load_c_s32);
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
    return MMA_LDST(1, m8n32k16_load_a_s8);
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
    return MMA_LDST(1, m8n32k16_load_a_u8);
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
    return MMA_LDST(4, m8n32k16_load_b_s8);
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
    return MMA_LDST(4, m8n32k16_load_b_u8);
  case NVPTX::BI__imma_m8n32k16_ld_c:
    return MMA_LDST(8, m8n32k16_load_c_s32);

  // Sub-integer MMA loads.
  // Only row/col layout is supported by A/B fragments.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_c:
    return MMA_LDST(2, m8n8k32_load_c_s32);
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
  case NVPTX::BI__bmma_m8n8k128_ld_c:
    return MMA_LDST(2, m8n8k128_load_c_s32);

  // Double MMA loads
  case NVPTX::BI__dmma_m8n8k4_ld_a:
    return MMA_LDST(1, m8n8k4_load_a_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_b:
    return MMA_LDST(1, m8n8k4_load_b_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_c:
    return MMA_LDST(2, m8n8k4_load_c_f64);

  // Alternate float MMA loads
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
    return MMA_LDST(4, m16n16k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
    return MMA_LDST(4, m16n16k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
    return MMA_LDST(2, m8n32k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
    return MMA_LDST(2, m32n8k16_load_b_bf16);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
    return MMA_LDST(4, m16n16k8_load_a_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
    return MMA_LDST(4, m16n16k8_load_b_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
    return MMA_LDST(8, m16n16k8_load_c_f32);

  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR, where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
  // FP MMA stores.
  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
    return MMA_LDST(4, m16n16k16_store_d_f16);
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
    return MMA_LDST(8, m16n16k16_store_d_f32);
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
    return MMA_LDST(4, m32n8k16_store_d_f16);
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
    return MMA_LDST(8, m32n8k16_store_d_f32);
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
    return MMA_LDST(4, m8n32k16_store_d_f16);
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
    return MMA_LDST(8, m8n32k16_store_d_f32);

  // Integer and sub-integer MMA stores.
  // Another naming quirk. Unlike other MMA builtins that use PTX types in the
  // name, integer loads/stores use LLVM's i32.
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
    return MMA_LDST(8, m16n16k16_store_d_s32);
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
    return MMA_LDST(8, m32n8k16_store_d_s32);
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
    return MMA_LDST(8, m8n32k16_store_d_s32);
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
    return MMA_LDST(2, m8n8k32_store_d_s32);
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
    return MMA_LDST(2, m8n8k128_store_d_s32);

  // Double MMA store
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
    return MMA_LDST(2, m8n8k4_store_d_f64);

  // Alternate float MMA store
  case NVPTX::BI__mma_m16n16k8_st_c_f32:
    return MMA_LDST(8, m16n16k8_store_d_f32);

  default:
    llvm_unreachable("Unknown MMA builtin");
  }
}
#undef MMA_LDST
#undef MMA_INTR

struct NVPTXMmaInfo {
  unsigned NumEltsA;
  unsigned NumEltsB;
  unsigned NumEltsC;
  unsigned NumEltsD;

  // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
  // over 'col' for layout. The index of non-satf variants is expected to match
  // the undocumented layout constants used by CUDA's mma.hpp.
  std::array<unsigned, 8> Variants;

  // Returns the intrinsic that matches Layout and Satf for valid combinations
  // of Layout and Satf, or 0 otherwise.
  unsigned getMMAIntrinsic(int Layout, bool Satf) {
    unsigned Index = Layout + 4 * Satf;
    if (Index >= Variants.size())
      return 0;
    return Variants[Index];
  }
};

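// Illustration (added comment, not in the upstream file): Layout encodes the
// row/col layouts of A and B (row.row = 0, row.col = 1, col.row = 2,
// col.col = 3), so Layout = 1 with Satf = true gives Index = 1 + 4 * 1 = 5,
// the row/col satfinite slot of Variants. Slots holding 0 mean there is no
// intrinsic for that Layout/Satf combination.
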
static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
  // clang-format off
#define MMA_VARIANTS(geom, type)                                    \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
#define MMA_SATF_VARIANTS(geom, type)                               \
      MMA_VARIANTS(geom, type),                                     \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
// Sub-integer MMA only supports row.col layout.
#define MMA_VARIANTS_I4(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
      0, \
      0, \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      0, \
      0
// b1 MMA does not support .satfinite.
#define MMA_VARIANTS_B1_XOR(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type, \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
#define MMA_VARIANTS_B1_AND(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type, \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
  // clang-format on
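  // Example expansion (added comment, not in the upstream file):
  // MMA_SATF_VARIANTS(m16n16k16, f16_f16) yields the four layout variants
  //   nvvm_wmma_m16n16k16_mma_row_row_f16_f16,
  //   nvvm_wmma_m16n16k16_mma_row_col_f16_f16,
  //   nvvm_wmma_m16n16k16_mma_col_row_f16_f16,
  //   nvvm_wmma_m16n16k16_mma_col_col_f16_f16,
  // followed by the same four names with a _satfinite suffix, matching the
  // Layout + 4 * Satf indexing used by NVPTXMmaInfo::getMMAIntrinsic.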
  switch (BuiltinID) {
  // FP MMA
  // Note that 'type' argument of MMA_SATF_VARIANTS uses D_C notation, while
  // NumEltsN of return value are ordered as A,B,C,D.
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};

  // Integer MMA
  case NVPTX::BI__imma_m16n16k16_mma_s8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
  case NVPTX::BI__imma_m16n16k16_mma_u8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_s8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_u8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_s8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_u8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};

  // Sub-integer MMA
  case NVPTX::BI__imma_m8n8k32_mma_s4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
  case NVPTX::BI__imma_m8n8k32_mma_u4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};

  // Double MMA
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
    return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};

  // Alternate FP MMA
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
    return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
    return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
  default:
    llvm_unreachable("Unexpected builtin ID.");
  }
#undef MMA_VARIANTS
#undef MMA_SATF_VARIANTS
#undef MMA_VARIANTS_I4
#undef MMA_VARIANTS_B1_AND
#undef MMA_VARIANTS_B1_XOR
}

static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
                      const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  QualType ArgType = E->getArg(0)->getType();
  clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
  return CGF.Builder.CreateCall(
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
      {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
}
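
// Rough shape of the emitted IR (added comment, not in the upstream file):
// for __nvvm_ldu_i(const int *p) this produces a call along the lines of
//   %v = call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr %p, i32 4)
// where the trailing i32 4 is the natural alignment of the pointee type.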

static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  QualType ArgType = E->getArg(0)->getType();
  clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());

  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
  auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
  auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
  MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
  LD->setMetadata(LLVMContext::MD_invariant_load, MD);

  return LD;
}
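
// Rough shape of the emitted IR (added comment, not in the upstream file):
// for __nvvm_ldg_f(const float *p) this produces roughly
//   %g = addrspacecast ptr %p to ptr addrspace(1)
//   %v = load float, ptr addrspace(1) %g, align 4, !invariant.load !0
// which lets the backend select the non-coherent ld.global.nc path.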

static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
                               const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Type *ElemTy =
      CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
  return CGF.Builder.CreateCall(
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
      {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
}

static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
                          CodeGenFunction &CGF, const CallExpr *E,
                          int SrcSize) {
  return E->getNumArgs() == 3
             ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
                                      {CGF.EmitScalarExpr(E->getArg(0)),
                                       CGF.EmitScalarExpr(E->getArg(1)),
                                       CGF.EmitScalarExpr(E->getArg(2))})
             : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
                                      {CGF.EmitScalarExpr(E->getArg(0)),
                                       CGF.EmitScalarExpr(E->getArg(1))});
}
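
// Usage note (added comment, not in the upstream file): the cp.async builtins
// accept an optional trailing src-size operand, so MakeCpAsync picks
// IntrinsicIDS (the "_s" variant) when the call site passes three arguments
// and the plain IntrinsicID for the two-argument form.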

static bool EnsureNativeHalfSupport(unsigned BuiltinID, const CallExpr *E,
                                    CodeGenFunction &CGF) {
  auto &C = CGF.CGM.getContext();
  if (!C.getLangOpts().NativeHalfType &&
      C.getTargetInfo().useFP16ConversionIntrinsics()) {
    CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
                                       " requires native half type support.");
    return false;
  }
  return true;
}

static Value *MakeHalfType(Function *Intrinsic, unsigned BuiltinID,
                           const CallExpr *E, CodeGenFunction &CGF) {
  if (!EnsureNativeHalfSupport(BuiltinID, E, CGF))
    return nullptr;

  SmallVector<Value *, 16> Args;
  auto *FTy = Intrinsic->getFunctionType();
  unsigned ICEArguments = 0;
  ASTContext::GetBuiltinTypeError Error;
  CGF.CGM.getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
  assert(Error == ASTContext::GE_None && "Should not codegen an error");
  for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
    assert((ICEArguments & (1 << i)) == 0);
    auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
    auto *PTy = FTy->getParamType(i);
    if (PTy != ArgValue->getType())
      ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
    Args.push_back(ArgValue);
  }

  return CGF.Builder.CreateCall(Intrinsic, Args);
}

static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
                           const CallExpr *E, CodeGenFunction &CGF) {
  return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF);
}

static Value *MakeFMAOOB(unsigned IntrinsicID, llvm::Type *Ty,
                         const CallExpr *E, CodeGenFunction &CGF) {
  return CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID, {Ty}),
                                {CGF.EmitScalarExpr(E->getArg(0)),
                                 CGF.EmitScalarExpr(E->getArg(1)),
                                 CGF.EmitScalarExpr(E->getArg(2))});
}

} // namespace

Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
                                             const CallExpr *E) {
  switch (BuiltinID) {
  case NVPTX::BI__nvvm_atom_add_gen_i:
  case NVPTX::BI__nvvm_atom_add_gen_l:
  case NVPTX::BI__nvvm_atom_add_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_sub_gen_i:
  case NVPTX::BI__nvvm_atom_sub_gen_l:
  case NVPTX::BI__nvvm_atom_sub_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_and_gen_i:
  case NVPTX::BI__nvvm_atom_and_gen_l:
  case NVPTX::BI__nvvm_atom_and_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_or_gen_i:
  case NVPTX::BI__nvvm_atom_or_gen_l:
  case NVPTX::BI__nvvm_atom_or_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_xor_gen_i:
  case NVPTX::BI__nvvm_atom_xor_gen_l:
  case NVPTX::BI__nvvm_atom_xor_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_xchg_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_max_gen_i:
  case NVPTX::BI__nvvm_atom_max_gen_l:
  case NVPTX::BI__nvvm_atom_max_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_max_gen_ui:
  case NVPTX::BI__nvvm_atom_max_gen_ul:
  case NVPTX::BI__nvvm_atom_max_gen_ull:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_min_gen_i:
  case NVPTX::BI__nvvm_atom_min_gen_l:
  case NVPTX::BI__nvvm_atom_min_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_min_gen_ui:
  case NVPTX::BI__nvvm_atom_min_gen_ul:
  case NVPTX::BI__nvvm_atom_min_gen_ull:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_cas_gen_us:
  case NVPTX::BI__nvvm_atom_cas_gen_i:
  case NVPTX::BI__nvvm_atom_cas_gen_l:
  case NVPTX::BI__nvvm_atom_cas_gen_ll:
    // __nvvm_atom_cas_gen_* should return the old value rather than the
    // success flag.
    return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false,
                                  AtomicOrdering::Monotonic,
                                  AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_add_gen_f:
  case NVPTX::BI__nvvm_atom_add_gen_d: {
    Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));

    return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
                                   AtomicOrdering::Monotonic);
  }
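  // Rough shape of the emitted IR (added comment, not in the upstream file):
  // __nvvm_atom_add_gen_f(float *p, float v) lowers to
  //   %old = atomicrmw fadd ptr %p, float %v monotonic
  // and the builtin evaluates to the value previously stored at *p.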

  case NVPTX::BI__nvvm_atom_inc_gen_ui:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_atom_dec_gen_ui:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E,
                                 AtomicOrdering::Monotonic);

  case NVPTX::BI__nvvm_ldg_c:
  case NVPTX::BI__nvvm_ldg_sc:
  case NVPTX::BI__nvvm_ldg_c2:
  case NVPTX::BI__nvvm_ldg_sc2:
  case NVPTX::BI__nvvm_ldg_c4:
  case NVPTX::BI__nvvm_ldg_sc4:
  case NVPTX::BI__nvvm_ldg_s:
  case NVPTX::BI__nvvm_ldg_s2:
  case NVPTX::BI__nvvm_ldg_s4:
  case NVPTX::BI__nvvm_ldg_i:
  case NVPTX::BI__nvvm_ldg_i2:
  case NVPTX::BI__nvvm_ldg_i4:
  case NVPTX::BI__nvvm_ldg_l:
  case NVPTX::BI__nvvm_ldg_l2:
  case NVPTX::BI__nvvm_ldg_ll:
  case NVPTX::BI__nvvm_ldg_ll2:
  case NVPTX::BI__nvvm_ldg_uc:
  case NVPTX::BI__nvvm_ldg_uc2:
  case NVPTX::BI__nvvm_ldg_uc4:
  case NVPTX::BI__nvvm_ldg_us:
  case NVPTX::BI__nvvm_ldg_us2:
  case NVPTX::BI__nvvm_ldg_us4:
  case NVPTX::BI__nvvm_ldg_ui:
  case NVPTX::BI__nvvm_ldg_ui2:
  case NVPTX::BI__nvvm_ldg_ui4:
  case NVPTX::BI__nvvm_ldg_ul:
  case NVPTX::BI__nvvm_ldg_ul2:
  case NVPTX::BI__nvvm_ldg_ull:
  case NVPTX::BI__nvvm_ldg_ull2:
  case NVPTX::BI__nvvm_ldg_f:
  case NVPTX::BI__nvvm_ldg_f2:
  case NVPTX::BI__nvvm_ldg_f4:
  case NVPTX::BI__nvvm_ldg_d:
  case NVPTX::BI__nvvm_ldg_d2:
    // PTX Interoperability section 2.2: "For a vector with an even number of
    // elements, its alignment is set to number of elements times the alignment
    // of its member: n*alignof(t)."
    return MakeLdg(*this, E);

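  // Worked example for the comment above (added, not in the upstream file):
  // __nvvm_ldg_f4 loads a 4-element float vector, so its alignment is
  // 4 * alignof(float) = 16 bytes; MakeLdg picks this up via the natural
  // pointee-type alignment rather than hard-coding a per-builtin value.
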
  case NVPTX::BI__nvvm_ldu_c:
  case NVPTX::BI__nvvm_ldu_sc:
  case NVPTX::BI__nvvm_ldu_c2:
  case NVPTX::BI__nvvm_ldu_sc2:
  case NVPTX::BI__nvvm_ldu_c4:
  case NVPTX::BI__nvvm_ldu_sc4:
  case NVPTX::BI__nvvm_ldu_s:
  case NVPTX::BI__nvvm_ldu_s2:
  case NVPTX::BI__nvvm_ldu_s4:
  case NVPTX::BI__nvvm_ldu_i:
  case NVPTX::BI__nvvm_ldu_i2:
  case NVPTX::BI__nvvm_ldu_i4:
  case NVPTX::BI__nvvm_ldu_l:
  case NVPTX::BI__nvvm_ldu_l2:
  case NVPTX::BI__nvvm_ldu_ll:
  case NVPTX::BI__nvvm_ldu_ll2:
  case NVPTX::BI__nvvm_ldu_uc:
  case NVPTX::BI__nvvm_ldu_uc2:
  case NVPTX::BI__nvvm_ldu_uc4:
  case NVPTX::BI__nvvm_ldu_us:
  case NVPTX::BI__nvvm_ldu_us2:
  case NVPTX::BI__nvvm_ldu_us4:
  case NVPTX::BI__nvvm_ldu_ui:
  case NVPTX::BI__nvvm_ldu_ui2:
  case NVPTX::BI__nvvm_ldu_ui4:
  case NVPTX::BI__nvvm_ldu_ul:
  case NVPTX::BI__nvvm_ldu_ul2:
  case NVPTX::BI__nvvm_ldu_ull:
  case NVPTX::BI__nvvm_ldu_ull2:
    return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
  case NVPTX::BI__nvvm_ldu_f:
  case NVPTX::BI__nvvm_ldu_f2:
  case NVPTX::BI__nvvm_ldu_f4:
  case NVPTX::BI__nvvm_ldu_d:
  case NVPTX::BI__nvvm_ldu_d2:
    return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);

  case NVPTX::BI__nvvm_atom_cta_add_gen_i:
  case NVPTX::BI__nvvm_atom_cta_add_gen_l:
  case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_add_gen_i:
  case NVPTX::BI__nvvm_atom_sys_add_gen_l:
  case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_add_gen_f:
  case NVPTX::BI__nvvm_atom_cta_add_gen_d:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_add_gen_f:
  case NVPTX::BI__nvvm_atom_sys_add_gen_d:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_max_gen_i:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
  case NVPTX::BI__nvvm_atom_cta_max_gen_l:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_max_gen_i:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
  case NVPTX::BI__nvvm_atom_sys_max_gen_l:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_min_gen_i:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
  case NVPTX::BI__nvvm_atom_cta_min_gen_l:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_min_gen_i:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
  case NVPTX::BI__nvvm_atom_sys_min_gen_l:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_and_gen_i:
  case NVPTX::BI__nvvm_atom_cta_and_gen_l:
  case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_and_gen_i:
  case NVPTX::BI__nvvm_atom_sys_and_gen_l:
  case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_or_gen_i:
  case NVPTX::BI__nvvm_atom_cta_or_gen_l:
  case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_or_gen_i:
  case NVPTX::BI__nvvm_atom_sys_or_gen_l:
  case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
  case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
  case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
  case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
  case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    llvm::Type *ElemTy =
        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
    return Builder.CreateCall(
        CGM.getIntrinsic(
            Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
  }
  case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    llvm::Type *ElemTy =
        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
    return Builder.CreateCall(
        CGM.getIntrinsic(
            Intrinsic::nvvm_atomic_cas_gen_i_sys, {ElemTy, Ptr->getType()}),
        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
  }
  case NVPTX::BI__nvvm_match_all_sync_i32p:
  case NVPTX::BI__nvvm_match_all_sync_i64p: {
    Value *Mask = EmitScalarExpr(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));
    Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
    Value *ResultPair = Builder.CreateCall(
        CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
                             ? Intrinsic::nvvm_match_all_sync_i32p
                             : Intrinsic::nvvm_match_all_sync_i64p),
        {Mask, Val});
    Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
                                     PredOutPtr.getElementType());
    Builder.CreateStore(Pred, PredOutPtr);
    return Builder.CreateExtractValue(ResultPair, 0);
  }
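  // Semantics sketch (added comment, not in the upstream file): the intrinsic
  // returns an {i32, i1} pair. The i1 "all threads matched" predicate is
  // zero-extended and stored through the third builtin argument, while the
  // i32 member mask becomes the value of the builtin call itself.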

  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
  case NVPTX::BI__hmma_m16n16k16_ld_b:
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
  case NVPTX::BI__hmma_m32n8k16_ld_a:
  case NVPTX::BI__hmma_m32n8k16_ld_b:
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
  case NVPTX::BI__hmma_m8n32k16_ld_a:
  case NVPTX::BI__hmma_m8n32k16_ld_b:
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
  // Integer MMA loads.
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
  case NVPTX::BI__imma_m16n16k16_ld_c:
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
  case NVPTX::BI__imma_m32n8k16_ld_c:
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
  case NVPTX::BI__imma_m8n32k16_ld_c:
  // Sub-integer MMA loads.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
  case NVPTX::BI__imma_m8n8k32_ld_c:
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_c:
  // Double MMA loads.
  case NVPTX::BI__dmma_m8n8k4_ld_a:
  case NVPTX::BI__dmma_m8n8k4_ld_b:
  case NVPTX::BI__dmma_m8n8k4_ld_c:
  // Alternate float MMA loads.
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Value *Src = EmitScalarExpr(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;

    Value *Result =
        Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});

    // Save returned values.
    assert(II.NumResults);
    if (II.NumResults == 1) {
      Builder.CreateAlignedStore(Result, Dst.emitRawPointer(*this),
                                 CharUnits::fromQuantity(4));
    } else {
      for (unsigned i = 0; i < II.NumResults; ++i) {
        Builder.CreateAlignedStore(
            Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
                                  Dst.getElementType()),
            Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                              llvm::ConstantInt::get(IntTy, i)),
            CharUnits::fromQuantity(4));
      }
    }
    return Result;
  }
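  // Worked example (added comment, not in the upstream file): for
  // __hmma_m16n16k16_ld_a with isColMajor == 0, IID is
  // nvvm_wmma_m16n16k16_load_a_f16_row_stride, NumResults is 8, and each
  // <2 x half> element of the returned aggregate is bitcast and stored to
  // dst[i] with 4-byte alignment.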

  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
  case NVPTX::BI__mma_m16n16k8_st_c_f32: {
    Value *Dst = EmitScalarExpr(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;
    Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
    llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
    SmallVector<Value *, 10> Values = {Dst};
    for (unsigned i = 0; i < II.NumResults; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          Src.getElementType(),
          Builder.CreateGEP(Src.getElementType(), Src.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, ParamType));
    }
    Values.push_back(Ldm);
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    return Result;
  }

  // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
  // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
  case NVPTX::BI__imma_m16n16k16_mma_s8:
  case NVPTX::BI__imma_m16n16k16_mma_u8:
  case NVPTX::BI__imma_m32n8k16_mma_s8:
  case NVPTX::BI__imma_m32n8k16_mma_u8:
  case NVPTX::BI__imma_m8n32k16_mma_s8:
  case NVPTX::BI__imma_m8n32k16_mma_u8:
  case NVPTX::BI__imma_m8n8k32_mma_s4:
  case NVPTX::BI__imma_m8n8k32_mma_u4:
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Address SrcA = EmitPointerWithAlignment(E->getArg(1));
    Address SrcB = EmitPointerWithAlignment(E->getArg(2));
    Address SrcC = EmitPointerWithAlignment(E->getArg(3));
    std::optional<llvm::APSInt> LayoutArg =
        E->getArg(4)->getIntegerConstantExpr(getContext());
    if (!LayoutArg)
      return nullptr;
    int Layout = LayoutArg->getSExtValue();
    if (Layout < 0 || Layout > 3)
      return nullptr;
    llvm::APSInt SatfArg;
    if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
        BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
      SatfArg = 0; // .b1 does not have satf argument.
    else if (std::optional<llvm::APSInt> OptSatfArg =
                 E->getArg(5)->getIntegerConstantExpr(getContext()))
      SatfArg = *OptSatfArg;
    else
      return nullptr;
    bool Satf = SatfArg.getSExtValue();
    NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
    unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
    if (IID == 0) // Unsupported combination of Layout/Satf.
      return nullptr;

    SmallVector<Value *, 24> Values;
    Function *Intrinsic = CGM.getIntrinsic(IID);
    llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
    // Load A
    for (unsigned i = 0; i < MI.NumEltsA; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcA.getElementType(),
          Builder.CreateGEP(SrcA.getElementType(), SrcA.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, AType));
    }
    // Load B
    llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
    for (unsigned i = 0; i < MI.NumEltsB; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcB.getElementType(),
          Builder.CreateGEP(SrcB.getElementType(), SrcB.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, BType));
    }
    // Load C
    llvm::Type *CType =
        Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
    for (unsigned i = 0; i < MI.NumEltsC; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcC.getElementType(),
          Builder.CreateGEP(SrcC.getElementType(), SrcC.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, CType));
    }
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    llvm::Type *DType = Dst.getElementType();
    for (unsigned i = 0; i < MI.NumEltsD; ++i)
      Builder.CreateAlignedStore(
          Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
          Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
    return Result;
  }
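  // Worked example (added comment, not in the upstream file): for
  // __hmma_m16n16k16_mma_f32f32(d, a, b, c, /*layout=*/1, /*satf=*/0),
  // Layout 1 and Satf false select nvvm_wmma_m16n16k16_mma_row_col_f32_f32;
  // the call consumes 8 A, 8 B, and 8 C fragment elements and writes the
  // 8 float results of fragment D back through d.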
  // The following builtins require half type support
  case NVPTX::BI__nvvm_ex2_approx_f16:
    return MakeHalfType(
        CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, Builder.getHalfTy()),
        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ex2_approx_f16x2:
    return MakeHalfType(
        CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx,
                         FixedVectorType::get(Builder.getHalfTy(), 2)),
        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_oob_f16:
    return MakeFMAOOB(Intrinsic::nvvm_fma_rn_oob, Builder.getHalfTy(), E,
                      *this);
  case NVPTX::BI__nvvm_fma_rn_oob_f16x2:
    return MakeFMAOOB(Intrinsic::nvvm_fma_rn_oob,
                      llvm::FixedVectorType::get(Builder.getHalfTy(), 2), E,
                      *this);
  case NVPTX::BI__nvvm_fma_rn_oob_bf16:
    return MakeFMAOOB(Intrinsic::nvvm_fma_rn_oob, Builder.getBFloatTy(), E,
                      *this);
  case NVPTX::BI__nvvm_fma_rn_oob_bf16x2:
    return MakeFMAOOB(Intrinsic::nvvm_fma_rn_oob,
                      llvm::FixedVectorType::get(Builder.getBFloatTy(), 2), E,
                      *this);
  case NVPTX::BI__nvvm_fma_rn_oob_relu_f16:
    return MakeFMAOOB(Intrinsic::nvvm_fma_rn_oob_relu, Builder.getHalfTy(), E,
                      *this);
  case NVPTX::BI__nvvm_fma_rn_oob_relu_f16x2:
    return MakeFMAOOB(Intrinsic::nvvm_fma_rn_oob_relu,
                      llvm::FixedVectorType::get(Builder.getHalfTy(), 2), E,
                      *this);
  case NVPTX::BI__nvvm_fma_rn_oob_relu_bf16:
    return MakeFMAOOB(Intrinsic::nvvm_fma_rn_oob_relu, Builder.getBFloatTy(), E,
                      *this);
  case NVPTX::BI__nvvm_fma_rn_oob_relu_bf16x2:
    return MakeFMAOOB(Intrinsic::nvvm_fma_rn_oob_relu,
                      llvm::FixedVectorType::get(Builder.getBFloatTy(), 2), E,
                      *this);
  case NVPTX::BI__nvvm_fmax_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fabs_f:
  case NVPTX::BI__nvvm_abs_bf16:
  case NVPTX::BI__nvvm_abs_bf16x2:
  case NVPTX::BI__nvvm_fabs_f16:
  case NVPTX::BI__nvvm_fabs_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_ftz_f:
  case NVPTX::BI__nvvm_fabs_ftz_f16:
  case NVPTX::BI__nvvm_fabs_ftz_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs_ftz,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_d:
    return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_ex2_approx_d:
  case NVPTX::BI__nvvm_ex2_approx_f:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_ex2_approx_ftz_f:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx_ftz,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_ldg_h:
  case NVPTX::BI__nvvm_ldg_h2:
    return EnsureNativeHalfSupport(BuiltinID, E, *this) ? MakeLdg(*this, E)
                                                        : nullptr;
  case NVPTX::BI__nvvm_ldu_h:
  case NVPTX::BI__nvvm_ldu_h2:
    return EnsureNativeHalfSupport(BuiltinID, E, *this)
               ? MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E)
               : nullptr;
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                       Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
                       4);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
                       Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
                       8);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
                       Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
                       16);
  case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
                       Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
                       16);
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
  case NVPTX::BI__nvvm_is_explicit_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
  case NVPTX::BI__nvvm_isspacep_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_mapa:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_mapa_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_getctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_getctarank_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_cluster_arrive:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
  case NVPTX::BI__nvvm_barrier_cluster_wait:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
  case NVPTX::BI__nvvm_fence_sc_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
  case NVPTX::BI__nvvm_bar_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__syncthreads:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        Builder.getInt32(0));
  case NVPTX::BI__nvvm_barrier_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_sync_cnt:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_bar0_and:
    return Builder.CreateZExt(
        Builder.CreateIntrinsic(
            Intrinsic::nvvm_barrier_cta_red_and_aligned_all, {},
            {Builder.getInt32(0),
             Builder.CreateICmpNE(EmitScalarExpr(E->getArg(0)),
                                  Builder.getInt32(0))}),
        Builder.getInt32Ty());
  case NVPTX::BI__nvvm_bar0_or:
    return Builder.CreateZExt(
        Builder.CreateIntrinsic(
            Intrinsic::nvvm_barrier_cta_red_or_aligned_all, {},
            {Builder.getInt32(0),
             Builder.CreateICmpNE(EmitScalarExpr(E->getArg(0)),
                                  Builder.getInt32(0))}),
        Builder.getInt32Ty());
  case NVPTX::BI__nvvm_bar0_popc:
    return Builder.CreateIntrinsic(
        Intrinsic::nvvm_barrier_cta_red_popc_aligned_all, {},
        {Builder.getInt32(0), Builder.CreateICmpNE(EmitScalarExpr(E->getArg(0)),
                                                   Builder.getInt32(0))});
  default:
    return nullptr;
  }
}