AMDGPU.cpp
1//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This contains code to emit Builtin calls as LLVM code.
10//
11//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
15#include "llvm/Analysis/ValueTracking.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
18#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
19#include "llvm/Support/AMDGPUAddrSpace.h"
20
21using namespace clang;
22using namespace CodeGen;
23using namespace llvm;
24
25namespace {
26
27// Has a second type-mangled argument.
28static Value *
29emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
30 Intrinsic::ID IntrinsicID,
31 Intrinsic::ID ConstrainedIntrinsicID) {
32 llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
33 llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
34
35 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
36 if (CGF.Builder.getIsFPConstrained()) {
37 Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
38 {Src0->getType(), Src1->getType()});
39 return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
40 }
41
42 Function *F =
43 CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
44 return CGF.Builder.CreateCall(F, {Src0, Src1});
45}
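// Note (descriptive, added for clarity): when the builder is in FP-constrained
// mode (e.g. strict FP exception semantics are in effect for this call site),
// the constrained intrinsic variant is emitted via CreateConstrainedFPCall
// instead of a plain call; both variants are overloaded ("mangled") on the
// types of the two source operands.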
46
47// If \p E is not a null pointer, insert an address space cast to match the
48// return type of \p E if necessary.
49Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
50 const CallExpr *E = nullptr) {
51 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
52 auto *Call = CGF.Builder.CreateCall(F);
53 Call->addRetAttr(
54 Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
55 Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
56 if (!E)
57 return Call;
58 QualType BuiltinRetType = E->getType();
59 auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
60 if (RetTy == Call->getType())
61 return Call;
62 return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
63}
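// Note (descriptive, added for clarity): the dereferenceable(64) and align(4)
// return attributes above reflect that the dispatch pointer refers to the HSA
// kernel_dispatch_packet, which the workgroup-size and grid-size helpers below
// index into.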
64
65Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
66 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
67 auto *Call = CGF.Builder.CreateCall(F);
68 Call->addRetAttr(
69 Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
70 Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
71 return Call;
72}
73
74// \p Index is 0, 1, and 2 for the x, y, and z dimensions, respectively.
75/// Emit code based on Code Object ABI version.
76/// COV_4 : Emit code to use dispatch ptr
77/// COV_5+ : Emit code to use implicitarg ptr
78/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
79/// and use its value for COV_4 or COV_5+ approach. It is used for
80/// compiling device libraries in an ABI-agnostic way.
81Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
82 llvm::LoadInst *LD;
83
84 auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
85
86 if (Cov == CodeObjectVersionKind::COV_None) {
87 StringRef Name = "__oclc_ABI_version";
88 auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
89 if (!ABIVersionC)
90 ABIVersionC = new llvm::GlobalVariable(
91 CGF.CGM.getModule(), CGF.Int32Ty, false,
92 llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
93 llvm::GlobalVariable::NotThreadLocal,
94 CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
95
96 // This load will be eliminated by the IPSCCP because it is constant
97 // weak_odr without externally_initialized. Either changing it to weak or
98 // adding externally_initialized will keep the load.
99 Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
100 CGF.CGM.getIntAlign());
101
102 Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
103 ABIVersion,
104 llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));
105
106 // Indexing the implicit kernarg segment.
107 Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
108 CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
109
110 // Indexing the HSA kernel_dispatch_packet struct.
111 Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
112 CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
113
114 auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
115 LD = CGF.Builder.CreateLoad(
116 Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
117 } else {
118 Value *GEP = nullptr;
119 if (Cov >= CodeObjectVersionKind::COV_5) {
120 // Indexing the implicit kernarg segment.
121 GEP = CGF.Builder.CreateConstGEP1_32(
122 CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
123 } else {
124 // Indexing the HSA kernel_dispatch_packet struct.
125 GEP = CGF.Builder.CreateConstGEP1_32(
126 CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
127 }
128 LD = CGF.Builder.CreateLoad(
129 Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
130 }
131
132 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
133 llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
134 APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
135 LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
136 LD->setMetadata(llvm::LLVMContext::MD_noundef,
137 llvm::MDNode::get(CGF.getLLVMContext(), {}));
138 LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
139 llvm::MDNode::get(CGF.getLLVMContext(), {}));
140 return LD;
141}
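// Illustrative lowering (a sketch, assuming COV_5 and Index == 0, i.e.
// __builtin_amdgcn_workgroup_size_x): the emitted IR is roughly
//   %ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
//   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 12
//   %wgs = load i16, ptr addrspace(4) %gep, align 2, !range !R, !invariant.load !I, !noundef !N
// i.e. a 16-bit invariant load at byte offset 12 + Index * 2, annotated with a
// [1, MaxOpenCLWorkGroupSize] range.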
142
143// \p Index is 0, 1, and 2 for the x, y, and z dimensions, respectively.
144Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
145 const unsigned XOffset = 12;
146 auto *DP = EmitAMDGPUDispatchPtr(CGF);
147 // Indexing the HSA kernel_dispatch_packet struct.
148 auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
149 auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
150 auto *LD = CGF.Builder.CreateLoad(
151 Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));
152
153 llvm::MDBuilder MDB(CGF.getLLVMContext());
154
155 // Known non-zero.
156 LD->setMetadata(llvm::LLVMContext::MD_range,
157 MDB.createRange(APInt(32, 1), APInt::getZero(32)));
158 LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
159 llvm::MDNode::get(CGF.getLLVMContext(), {}));
160 return LD;
161}
162} // namespace
163
164// Generates the IR for __builtin_read_exec_*.
165// Lowers the builtin to amdgcn_ballot intrinsic.
166static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
167 llvm::Type *RegisterType,
168 llvm::Type *ValueType, bool isExecHi) {
169 CodeGen::CGBuilderTy &Builder = CGF.Builder;
170 CodeGen::CodeGenModule &CGM = CGF.CGM;
171
172 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
173 llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});
174
175 if (isExecHi) {
176 Value *Rt2 = Builder.CreateLShr(Call, 32);
177 Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
178 return Rt2;
179 }
180
181 return Call;
182}
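// Illustrative lowering: __builtin_amdgcn_read_exec_hi() becomes a wave-wide
// ballot of the constant true (i.e. the 64-bit exec mask), followed by a
// logical shift right by 32 and a truncation to i32.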
183
184// Emit an intrinsic that has 1 float or double operand, and 1 integer.
185static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
186 const CallExpr *E,
187 unsigned IntrinsicID) {
188 llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
189 llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
190
191 Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
192 return CGF.Builder.CreateCall(F, {Src0, Src1});
193}
194
195// For processing memory ordering and memory scope arguments of various
196// amdgcn builtins.
197// \p Order takes a C++11 compatible memory-ordering specifier and converts
198// it into LLVM's memory ordering specifier using the atomic C ABI, and writes
199// to \p AO. \p Scope takes a const char * and converts it into an AMDGCN-
200// specific SyncScopeID and writes it to \p SSID.
201void CodeGenFunction::ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope,
202 llvm::AtomicOrdering &AO,
203 llvm::SyncScope::ID &SSID) {
204 int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
205
206 // Map C11/C++11 memory ordering to LLVM memory ordering
207 assert(llvm::isValidAtomicOrderingCABI(ord));
208 switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
209 case llvm::AtomicOrderingCABI::acquire:
210 case llvm::AtomicOrderingCABI::consume:
211 AO = llvm::AtomicOrdering::Acquire;
212 break;
213 case llvm::AtomicOrderingCABI::release:
214 AO = llvm::AtomicOrdering::Release;
215 break;
216 case llvm::AtomicOrderingCABI::acq_rel:
217 AO = llvm::AtomicOrdering::AcquireRelease;
218 break;
219 case llvm::AtomicOrderingCABI::seq_cst:
220 AO = llvm::AtomicOrdering::SequentiallyConsistent;
221 break;
222 case llvm::AtomicOrderingCABI::relaxed:
223 AO = llvm::AtomicOrdering::Monotonic;
224 break;
225 }
226
227 // Some of the atomic builtins take the scope as a string name.
228 StringRef scp;
229 if (llvm::getConstantStringInfo(Scope, scp)) {
230 SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
231 return;
232 }
233
234 // Older builtins had an enum argument for the memory scope.
235 int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
236 switch (scope) {
237 case 0: // __MEMORY_SCOPE_SYSTEM
238 SSID = llvm::SyncScope::System;
239 break;
240 case 1: // __MEMORY_SCOPE_DEVICE
241 SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
242 break;
243 case 2: // __MEMORY_SCOPE_WRKGRP
244 SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
245 break;
246 case 3: // __MEMORY_SCOPE_WVFRNT
247 SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
248 break;
249 case 4: // __MEMORY_SCOPE_SINGLE
250 SSID = llvm::SyncScope::SingleThread;
251 break;
252 default:
253 SSID = llvm::SyncScope::System;
254 break;
255 }
256}
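// Illustrative use: for __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"), the
// fence handler below routes both arguments through this helper, producing
// AO = SequentiallyConsistent and SSID = the "agent" sync scope, so the emitted
// instruction is roughly: fence syncscope("agent") seq_cst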
257
258llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
259 unsigned Idx,
260 const CallExpr *E) {
261 llvm::Value *Arg = nullptr;
262 if ((ICEArguments & (1 << Idx)) == 0) {
263 Arg = EmitScalarExpr(E->getArg(Idx));
264 } else {
265 // If this is required to be a constant, constant fold it so that we
266 // know that the generated intrinsic gets a ConstantInt.
267 std::optional<llvm::APSInt> Result =
268 E->getArg(Idx)->getIntegerConstantExpr(getContext());
269 assert(Result && "Expected argument to be a constant");
270 Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
271 }
272 return Arg;
273}
274
275void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
276 const CallExpr *E) {
277 constexpr const char *Tag = "amdgpu-synchronize-as";
278
279 LLVMContext &Ctx = Inst->getContext();
280 SmallVector<MMRAMetadata::TagT, 3> MMRAs;
281 for (unsigned K = 2; K < E->getNumArgs(); ++K) {
282 llvm::Value *V = EmitScalarExpr(E->getArg(K));
283 StringRef AS;
284 if (llvm::getConstantStringInfo(V, AS)) {
285 MMRAs.push_back({Tag, AS});
286 // TODO: Delete the resulting unused constant?
287 continue;
288 }
290 "expected an address space name as a string literal");
291 }
292
293 llvm::sort(MMRAs);
294 MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
295 Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
296}
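// Illustrative use: a call such as __builtin_amdgcn_fence(__ATOMIC_RELEASE,
// "agent", "local") passes its trailing address-space names ("local") through
// this helper, attaching !mmra metadata of the form
// {"amdgpu-synchronize-as", "local"} to the emitted fence.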
297
298Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
299 const CallExpr *E) {
300 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
301 llvm::SyncScope::ID SSID;
302 switch (BuiltinID) {
303 case AMDGPU::BI__builtin_amdgcn_div_scale:
304 case AMDGPU::BI__builtin_amdgcn_div_scalef: {
305 // Translate from the intrinsic's struct return to the builtin's out
306 // argument.
307
308 Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
309
310 llvm::Value *X = EmitScalarExpr(E->getArg(0));
311 llvm::Value *Y = EmitScalarExpr(E->getArg(1));
312 llvm::Value *Z = EmitScalarExpr(E->getArg(2));
313
314 llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
315 X->getType());
316
317 llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
318
319 llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
320 llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
321
322 llvm::Type *RealFlagType = FlagOutPtr.getElementType();
323
324 llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
325 Builder.CreateStore(FlagExt, FlagOutPtr);
326 return Result;
327 }
328 case AMDGPU::BI__builtin_amdgcn_div_fmas:
329 case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
330 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
331 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
332 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
333 llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
334
335 llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
336 Src0->getType());
337 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
338 return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
339 }
340
341 case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
342 return emitBuiltinWithOneOverloadedType<2>(*this, E,
343 Intrinsic::amdgcn_ds_swizzle);
344 case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
345 case AMDGPU::BI__builtin_amdgcn_mov_dpp:
346 case AMDGPU::BI__builtin_amdgcn_update_dpp: {
347 llvm::SmallVector<llvm::Value *, 6> Args;
348 // Find out if any arguments are required to be integer constant
349 // expressions.
350 unsigned ICEArguments = 0;
351 ASTContext::GetBuiltinTypeError Error;
352 getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
353 assert(Error == ASTContext::GE_None && "Should not codegen an error");
354 llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
355 unsigned Size = DataTy->getPrimitiveSizeInBits();
356 llvm::Type *IntTy =
357 llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
358 Function *F =
359 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
360 ? Intrinsic::amdgcn_mov_dpp8
361 : Intrinsic::amdgcn_update_dpp,
362 IntTy);
363 assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
364 E->getNumArgs() == 2);
365 bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
366 if (InsertOld)
367 Args.push_back(llvm::PoisonValue::get(IntTy));
368 for (unsigned I = 0; I != E->getNumArgs(); ++I) {
369 llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
370 if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
371 Size < 32) {
372 if (!DataTy->isIntegerTy())
373 V = Builder.CreateBitCast(
374 V, llvm::IntegerType::get(Builder.getContext(), Size));
375 V = Builder.CreateZExtOrBitCast(V, IntTy);
376 }
377 llvm::Type *ExpTy =
378 F->getFunctionType()->getFunctionParamType(I + InsertOld);
379 Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
380 }
381 Value *V = Builder.CreateCall(F, Args);
382 if (Size < 32 && !DataTy->isIntegerTy())
383 V = Builder.CreateTrunc(
384 V, llvm::IntegerType::get(Builder.getContext(), Size));
385 return Builder.CreateTruncOrBitCast(V, DataTy);
386 }
387 case AMDGPU::BI__builtin_amdgcn_permlane16:
388 case AMDGPU::BI__builtin_amdgcn_permlanex16:
389 return emitBuiltinWithOneOverloadedType<6>(
390 *this, E,
391 BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
392 ? Intrinsic::amdgcn_permlane16
393 : Intrinsic::amdgcn_permlanex16);
394 case AMDGPU::BI__builtin_amdgcn_permlane64:
395 return emitBuiltinWithOneOverloadedType<1>(*this, E,
396 Intrinsic::amdgcn_permlane64);
397 case AMDGPU::BI__builtin_amdgcn_readlane:
398 return emitBuiltinWithOneOverloadedType<2>(*this, E,
399 Intrinsic::amdgcn_readlane);
400 case AMDGPU::BI__builtin_amdgcn_readfirstlane:
401 return emitBuiltinWithOneOverloadedType<1>(*this, E,
402 Intrinsic::amdgcn_readfirstlane);
403 case AMDGPU::BI__builtin_amdgcn_div_fixup:
404 case AMDGPU::BI__builtin_amdgcn_div_fixupf:
405 case AMDGPU::BI__builtin_amdgcn_div_fixuph:
406 return emitBuiltinWithOneOverloadedType<3>(*this, E,
407 Intrinsic::amdgcn_div_fixup);
408 case AMDGPU::BI__builtin_amdgcn_trig_preop:
409 case AMDGPU::BI__builtin_amdgcn_trig_preopf:
410 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
411 case AMDGPU::BI__builtin_amdgcn_rcp:
412 case AMDGPU::BI__builtin_amdgcn_rcpf:
413 case AMDGPU::BI__builtin_amdgcn_rcph:
414 case AMDGPU::BI__builtin_amdgcn_rcp_bf16:
415 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
416 case AMDGPU::BI__builtin_amdgcn_sqrt:
417 case AMDGPU::BI__builtin_amdgcn_sqrtf:
418 case AMDGPU::BI__builtin_amdgcn_sqrth:
419 case AMDGPU::BI__builtin_amdgcn_sqrt_bf16:
420 return emitBuiltinWithOneOverloadedType<1>(*this, E,
421 Intrinsic::amdgcn_sqrt);
422 case AMDGPU::BI__builtin_amdgcn_rsq:
423 case AMDGPU::BI__builtin_amdgcn_rsqf:
424 case AMDGPU::BI__builtin_amdgcn_rsqh:
425 case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
426 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
427 case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
428 case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
429 return emitBuiltinWithOneOverloadedType<1>(*this, E,
430 Intrinsic::amdgcn_rsq_clamp);
431 case AMDGPU::BI__builtin_amdgcn_sinf:
432 case AMDGPU::BI__builtin_amdgcn_sinh:
433 case AMDGPU::BI__builtin_amdgcn_sin_bf16:
434 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
435 case AMDGPU::BI__builtin_amdgcn_cosf:
436 case AMDGPU::BI__builtin_amdgcn_cosh:
437 case AMDGPU::BI__builtin_amdgcn_cos_bf16:
438 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
439 case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
440 return EmitAMDGPUDispatchPtr(*this, E);
441 case AMDGPU::BI__builtin_amdgcn_logf:
442 case AMDGPU::BI__builtin_amdgcn_log_bf16:
443 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
444 case AMDGPU::BI__builtin_amdgcn_exp2f:
445 case AMDGPU::BI__builtin_amdgcn_exp2_bf16:
446 return emitBuiltinWithOneOverloadedType<1>(*this, E,
447 Intrinsic::amdgcn_exp2);
448 case AMDGPU::BI__builtin_amdgcn_log_clampf:
449 return emitBuiltinWithOneOverloadedType<1>(*this, E,
450 Intrinsic::amdgcn_log_clamp);
451 case AMDGPU::BI__builtin_amdgcn_ldexp:
452 case AMDGPU::BI__builtin_amdgcn_ldexpf: {
453 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
454 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
455 llvm::Function *F =
456 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
457 return Builder.CreateCall(F, {Src0, Src1});
458 }
459 case AMDGPU::BI__builtin_amdgcn_ldexph: {
460 // The raw instruction has a different behavior for out of bounds exponent
461 // values (implicit truncation instead of saturate to short_min/short_max).
462 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
463 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
464 llvm::Function *F =
465 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
466 return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
467 }
468 case AMDGPU::BI__builtin_amdgcn_frexp_mant:
469 case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
470 case AMDGPU::BI__builtin_amdgcn_frexp_manth:
471 return emitBuiltinWithOneOverloadedType<1>(*this, E,
472 Intrinsic::amdgcn_frexp_mant);
473 case AMDGPU::BI__builtin_amdgcn_frexp_exp:
474 case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
475 Value *Src0 = EmitScalarExpr(E->getArg(0));
476 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
477 { Builder.getInt32Ty(), Src0->getType() });
478 return Builder.CreateCall(F, Src0);
479 }
480 case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
481 Value *Src0 = EmitScalarExpr(E->getArg(0));
482 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
483 { Builder.getInt16Ty(), Src0->getType() });
484 return Builder.CreateCall(F, Src0);
485 }
486 case AMDGPU::BI__builtin_amdgcn_fract:
487 case AMDGPU::BI__builtin_amdgcn_fractf:
488 case AMDGPU::BI__builtin_amdgcn_fracth:
489 return emitBuiltinWithOneOverloadedType<1>(*this, E,
490 Intrinsic::amdgcn_fract);
491 case AMDGPU::BI__builtin_amdgcn_lerp:
492 return emitBuiltinWithOneOverloadedType<3>(*this, E,
493 Intrinsic::amdgcn_lerp);
494 case AMDGPU::BI__builtin_amdgcn_ubfe:
495 return emitBuiltinWithOneOverloadedType<3>(*this, E,
496 Intrinsic::amdgcn_ubfe);
497 case AMDGPU::BI__builtin_amdgcn_sbfe:
498 return emitBuiltinWithOneOverloadedType<3>(*this, E,
499 Intrinsic::amdgcn_sbfe);
500 case AMDGPU::BI__builtin_amdgcn_ballot_w32:
501 case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
502 llvm::Type *ResultType = ConvertType(E->getType());
503 llvm::Value *Src = EmitScalarExpr(E->getArg(0));
504 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
505 return Builder.CreateCall(F, { Src });
506 }
507 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
508 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
509 llvm::Value *Src = EmitScalarExpr(E->getArg(0));
510 Function *F =
511 CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()});
512 return Builder.CreateCall(F, {Src});
513 }
514 case AMDGPU::BI__builtin_amdgcn_tanhf:
515 case AMDGPU::BI__builtin_amdgcn_tanhh:
516 case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
517 return emitBuiltinWithOneOverloadedType<1>(*this, E,
518 Intrinsic::amdgcn_tanh);
519 case AMDGPU::BI__builtin_amdgcn_uicmp:
520 case AMDGPU::BI__builtin_amdgcn_uicmpl:
521 case AMDGPU::BI__builtin_amdgcn_sicmp:
522 case AMDGPU::BI__builtin_amdgcn_sicmpl: {
523 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
524 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
525 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
526
527 // FIXME-GFX10: How should 32 bit mask be handled?
528 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
529 { Builder.getInt64Ty(), Src0->getType() });
530 return Builder.CreateCall(F, { Src0, Src1, Src2 });
531 }
532 case AMDGPU::BI__builtin_amdgcn_fcmp:
533 case AMDGPU::BI__builtin_amdgcn_fcmpf: {
534 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
535 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
536 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
537
538 // FIXME-GFX10: How should 32 bit mask be handled?
539 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
540 { Builder.getInt64Ty(), Src0->getType() });
541 return Builder.CreateCall(F, { Src0, Src1, Src2 });
542 }
543 case AMDGPU::BI__builtin_amdgcn_class:
544 case AMDGPU::BI__builtin_amdgcn_classf:
545 case AMDGPU::BI__builtin_amdgcn_classh:
546 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
547 case AMDGPU::BI__builtin_amdgcn_fmed3f:
548 case AMDGPU::BI__builtin_amdgcn_fmed3h:
549 return emitBuiltinWithOneOverloadedType<3>(*this, E,
550 Intrinsic::amdgcn_fmed3);
551 case AMDGPU::BI__builtin_amdgcn_ds_append:
552 case AMDGPU::BI__builtin_amdgcn_ds_consume: {
553 Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
554 Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
555 Value *Src0 = EmitScalarExpr(E->getArg(0));
556 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
557 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
558 }
559 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
560 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
561 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
562 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
563 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
564 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
565 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
566 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
567 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
568 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
569 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
570 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
571 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
572 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
573 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
574 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
575 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
576 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
577 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
578 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
579 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
580 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
581 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
582 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
583 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
584 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
585 Intrinsic::ID IID;
586 switch (BuiltinID) {
587 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
588 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
589 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
590 IID = Intrinsic::amdgcn_global_load_tr_b64;
591 break;
592 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
593 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
594 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
595 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
596 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
597 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
598 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
599 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
600 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
601 IID = Intrinsic::amdgcn_global_load_tr_b128;
602 break;
603 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
604 IID = Intrinsic::amdgcn_global_load_tr4_b64;
605 break;
606 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
607 IID = Intrinsic::amdgcn_global_load_tr6_b96;
608 break;
609 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
610 IID = Intrinsic::amdgcn_ds_load_tr4_b64;
611 break;
612 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
613 IID = Intrinsic::amdgcn_ds_load_tr6_b96;
614 break;
615 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
616 IID = Intrinsic::amdgcn_ds_load_tr8_b64;
617 break;
618 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
619 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
620 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
621 IID = Intrinsic::amdgcn_ds_load_tr16_b128;
622 break;
623 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
624 IID = Intrinsic::amdgcn_ds_read_tr4_b64;
625 break;
626 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
627 IID = Intrinsic::amdgcn_ds_read_tr8_b64;
628 break;
629 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
630 IID = Intrinsic::amdgcn_ds_read_tr6_b96;
631 break;
632 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
633 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
634 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
635 IID = Intrinsic::amdgcn_ds_read_tr16_b64;
636 break;
637 }
638 llvm::Type *LoadTy = ConvertType(E->getType());
639 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
640 llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
641 return Builder.CreateCall(F, {Addr});
642 }
643 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
644 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
645 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
646 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
647 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
648 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: {
649
650 Intrinsic::ID IID;
651 switch (BuiltinID) {
652 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
653 IID = Intrinsic::amdgcn_global_load_monitor_b32;
654 break;
655 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
656 IID = Intrinsic::amdgcn_global_load_monitor_b64;
657 break;
658 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
659 IID = Intrinsic::amdgcn_global_load_monitor_b128;
660 break;
661 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
662 IID = Intrinsic::amdgcn_flat_load_monitor_b32;
663 break;
664 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
665 IID = Intrinsic::amdgcn_flat_load_monitor_b64;
666 break;
667 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128:
668 IID = Intrinsic::amdgcn_flat_load_monitor_b128;
669 break;
670 }
671
672 llvm::Type *LoadTy = ConvertType(E->getType());
673 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
674 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
675 llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
676 return Builder.CreateCall(F, {Addr, Val});
677 }
678 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
679 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
680 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
681 Intrinsic::ID IID;
682 switch (BuiltinID) {
683 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
684 IID = Intrinsic::amdgcn_cluster_load_b32;
685 break;
686 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
687 IID = Intrinsic::amdgcn_cluster_load_b64;
688 break;
689 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
690 IID = Intrinsic::amdgcn_cluster_load_b128;
691 break;
692 }
693 SmallVector<Value *> Args;
694 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
695 Args.push_back(EmitScalarExpr(E->getArg(i)));
696 llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
697 return Builder.CreateCall(F, {Args});
698 }
699 case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
700 // Should this have asan instrumentation?
701 return emitBuiltinWithOneOverloadedType<5>(*this, E,
702 Intrinsic::amdgcn_load_to_lds);
703 }
704 case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
705 Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
706 {llvm::Type::getInt64Ty(getLLVMContext())});
707 return Builder.CreateCall(F);
708 }
709 case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
710 Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
711 {llvm::Type::getInt64Ty(getLLVMContext())});
712 llvm::Value *Env = EmitScalarExpr(E->getArg(0));
713 return Builder.CreateCall(F, {Env});
714 }
715 case AMDGPU::BI__builtin_amdgcn_read_exec:
716 return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
717 case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
718 return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
719 case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
720 return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
721 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
722 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
723 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
724 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
725 llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
726 llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
727 llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
728 llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
729 llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
730 llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
731
732 // The builtins take these arguments as vec4 where the last element is
733 // ignored. The intrinsic takes them as vec3.
734 RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
735 {0, 1, 2});
736 RayDir =
737 Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
738 RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
739 {0, 1, 2});
740
741 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
742 {NodePtr->getType(), RayDir->getType()});
743 return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
744 RayInverseDir, TextureDescr});
745 }
746 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
747 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
748 Intrinsic::ID IID;
749 switch (BuiltinID) {
750 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
751 IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
752 break;
753 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
754 IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
755 break;
756 }
757 llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
758 llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
759 llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
760 llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
761 llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
762 llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
763 llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));
764
765 Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
766 Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));
767
768 llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
769
770 llvm::CallInst *CI = Builder.CreateCall(
771 IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
772 Offset, TextureDescr});
773
774 llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
775 llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
776 llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);
777
778 Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
779 Builder.CreateStore(RetRayDir, RetRayDirPtr);
780
781 return RetVData;
782 }
783
784 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
785 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
786 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
787 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
788 Intrinsic::ID IID;
789 switch (BuiltinID) {
790 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
791 IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
792 break;
793 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
794 IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
795 break;
796 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
797 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
798 break;
799 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
800 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
801 break;
802 }
803
804 SmallVector<Value *> Args;
805 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
806 Args.push_back(EmitScalarExpr(E->getArg(i)));
807
808 Function *F = CGM.getIntrinsic(IID);
809 Value *Call = Builder.CreateCall(F, Args);
810 Value *Rtn = Builder.CreateExtractValue(Call, 0);
811 Value *A = Builder.CreateExtractValue(Call, 1);
812 llvm::Type *RetTy = ConvertType(E->getType());
813 Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
814 (uint64_t)0);
815 // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
816 // <2 x i64>, zext the second value.
817 if (A->getType()->getPrimitiveSizeInBits() <
818 RetTy->getScalarType()->getPrimitiveSizeInBits())
819 A = Builder.CreateZExt(A, RetTy->getScalarType());
820
821 return Builder.CreateInsertElement(I0, A, 1);
822 }
823 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
824 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
825 llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
826 Function *F = CGM.getIntrinsic(
827 BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
828 ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
829 : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
830 {VT, VT});
831
832 SmallVector<Value *> Args;
833 for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
834 Args.push_back(EmitScalarExpr(E->getArg(I)));
835 return Builder.CreateCall(F, Args);
836 }
837 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
838 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
839 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
840 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
841 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
842 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
843 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
844 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
845 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
846 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
847 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
848 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
849 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
850 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
851 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
852 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
853 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
854 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
855 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
856 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
857 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
858 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
859 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
860 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
861 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
862 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
863 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
864 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
865 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
866 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
867 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
868 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
869 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
870 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
871 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
872 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
873 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
874 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
875 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
876 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
877 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
878 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
879 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
880 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
881 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
882 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
883 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
884 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
885 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
886 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
887 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
888 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
889 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
890 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
891 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
892 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
893 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
894 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
895 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
896 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
897 // GFX1250 WMMA builtins
898 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
899 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
900 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
901 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
902 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
903 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
904 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
905 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
906 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
907 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
908 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
909 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
910 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
911 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
912 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
913 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
914 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
915 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
916 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
917 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
918 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
919 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
920 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
921 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
922 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
923 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
924 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
925 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
926 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
927 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
928 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
929 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
930 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
931 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
932 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
933 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
934 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
935 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
936 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
937 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
938 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
939 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
940 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
941
942 // These operations perform a matrix multiplication and accumulation of
943 // the form:
944 // D = A * B + C
945 // We need to specify one type for matrices AB and one for matrices CD.
946 // Sparse matrix operations can have different types for A and B as well as
947 // an additional type for the sparsity index.
948 // The destination type should be put before the types used for the source operands.
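// For example, the {2, 0} entries below mean the intrinsic is overloaded first
// on the type of argument 2 (the C/D accumulator) and then on the type of
// argument 0 (the A/B operands); the ArgTypes loop at the end of this case
// collects those types in that order.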
949 SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
950 // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
951 // There is no need for the variable opsel argument, so always set it to
952 // "false".
953 bool AppendFalseForOpselArg = false;
954 unsigned BuiltinWMMAOp;
955 // Need return type when D and C are of different types.
956 bool NeedReturnType = false;
957
958 switch (BuiltinID) {
959 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
960 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
961 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
962 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
963 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
964 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
965 break;
966 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
967 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
968 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
969 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
970 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
971 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
972 break;
973 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
974 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
975 AppendFalseForOpselArg = true;
976 [[fallthrough]];
977 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
978 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
979 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
980 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
981 break;
982 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
983 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
984 AppendFalseForOpselArg = true;
985 [[fallthrough]];
986 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
987 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
988 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
989 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
990 break;
991 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
992 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
993 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
994 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
995 break;
996 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
997 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
998 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
999 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
1000 break;
1001 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1002 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1003 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1004 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
1005 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1006 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
1007 break;
1008 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1009 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1010 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1011 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
1012 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1013 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
1014 break;
1015 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1016 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
1017 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1018 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
1019 break;
1020 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1021 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
1022 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1023 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
1024 break;
1025 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1026 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
1027 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1028 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
1029 break;
1030 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1031 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
1032 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1033 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
1034 break;
1035 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1036 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
1037 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1038 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
1039 break;
1040 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1041 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
1042 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1043 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
1044 break;
1045 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1046 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
1047 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1048 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
1049 break;
1050 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1051 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
1052 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1053 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
1054 break;
1055 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1056 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1057 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1058 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
1059 break;
1060 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1061 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1062 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1063 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
1064 break;
1065 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1066 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1067 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1068 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
1069 break;
1070 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1071 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1072 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1073 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
1074 break;
1075 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1076 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1077 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1078 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
1079 break;
1080 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1081 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1082 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1083 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
1084 break;
1085 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1086 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1087 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1088 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
1089 break;
1090 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1091 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1092 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1093 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
1094 break;
1095 // GFX1250 WMMA builtins
1096 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1097 ArgsForMatchingMatrixTypes = {5, 1};
1098 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
1099 break;
1100 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1101 ArgsForMatchingMatrixTypes = {5, 1};
1102 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
1103 break;
1104 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1105 ArgsForMatchingMatrixTypes = {5, 1};
1106 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
1107 break;
1108 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1109 ArgsForMatchingMatrixTypes = {5, 1};
1110 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
1111 break;
1112 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1113 ArgsForMatchingMatrixTypes = {5, 1};
1114 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
1115 break;
1116 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1117 NeedReturnType = true;
1118 ArgsForMatchingMatrixTypes = {1, 5};
1119 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
1120 break;
1121 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1122 ArgsForMatchingMatrixTypes = {3, 0};
1123 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
1124 break;
1125 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1126 ArgsForMatchingMatrixTypes = {3, 0};
1127 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
1128 break;
1129 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1130 ArgsForMatchingMatrixTypes = {3, 0};
1131 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
1132 break;
1133 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1134 ArgsForMatchingMatrixTypes = {3, 0};
1135 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
1136 break;
1137 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1138 ArgsForMatchingMatrixTypes = {3, 0};
1139 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
1140 break;
1141 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1142 ArgsForMatchingMatrixTypes = {3, 0};
1143 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
1144 break;
1145 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1146 ArgsForMatchingMatrixTypes = {3, 0};
1147 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
1148 break;
1149 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1150 ArgsForMatchingMatrixTypes = {3, 0};
1151 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
1152 break;
1153 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1154 ArgsForMatchingMatrixTypes = {3, 0};
1155 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
1156 break;
1157 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1158 ArgsForMatchingMatrixTypes = {3, 0};
1159 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
1160 break;
1161 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1162 ArgsForMatchingMatrixTypes = {3, 0};
1163 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
1164 break;
1165 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1166 ArgsForMatchingMatrixTypes = {3, 0};
1167 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
1168 break;
1169 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1170 ArgsForMatchingMatrixTypes = {3, 0};
1171 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
1172 break;
1173 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1174 ArgsForMatchingMatrixTypes = {3, 0};
1175 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
1176 break;
1177 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1178 ArgsForMatchingMatrixTypes = {3, 0};
1179 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
1180 break;
1181 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1182 ArgsForMatchingMatrixTypes = {3, 0};
1183 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
1184 break;
1185 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1186 ArgsForMatchingMatrixTypes = {4, 1};
1187 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
1188 break;
1189 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1190 ArgsForMatchingMatrixTypes = {5, 1, 3};
1191 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
1192 break;
1193 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1194 ArgsForMatchingMatrixTypes = {5, 1, 3};
1195 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4;
1196 break;
1197 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1198 ArgsForMatchingMatrixTypes = {5, 1, 3};
1199 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4;
1200 break;
1201 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1202 ArgsForMatchingMatrixTypes = {3, 0, 1};
1203 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
1204 break;
1205 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1206 ArgsForMatchingMatrixTypes = {3, 0, 1};
1207 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
1208 break;
1209 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1210 ArgsForMatchingMatrixTypes = {3, 0, 1};
1211 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
1212 break;
1213 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1214 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1215 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
1216 break;
1217 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1218 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1219 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
1220 break;
1221 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1222 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1223 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
1224 break;
1225 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1226 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1227 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
1228 break;
1229 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1230 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1231 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
1232 break;
1233 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1234 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1235 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
1236 break;
1237 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1238 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1239 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
1240 break;
1241 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1242 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1243 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
1244 break;
1245 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1246 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1247 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
1248 break;
1249 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1250 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1251 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
1252 break;
1253 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1254 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1255 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
1256 break;
1257 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1258 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1259 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
1260 break;
1261 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1262 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1263 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
1264 break;
1265 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
1266 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1267 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
1268 break;
1269 }
1270
1271 SmallVector<Value *, 6> Args;
1272 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
1273 Args.push_back(EmitScalarExpr(E->getArg(i)));
1274 if (AppendFalseForOpselArg)
1275 Args.push_back(Builder.getFalse());
1276
1277 SmallVector<llvm::Type *, 6> ArgTypes;
1278 if (NeedReturnType)
1279 ArgTypes.push_back(ConvertType(E->getType()));
1280 for (auto ArgIdx : ArgsForMatchingMatrixTypes)
1281 ArgTypes.push_back(Args[ArgIdx]->getType());
1282
1283 Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
1284 return Builder.CreateCall(F, Args);
1285 }
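// For illustration: ArgsForMatchingMatrixTypes lists the operand indices whose IR
// types resolve the intrinsic's overloads, so with {2, 0, 1, 3} the call above is
// roughly equivalent to
//   CGM.getIntrinsic(BuiltinWMMAOp, {Args[2]->getType(), Args[0]->getType(),
//                                    Args[1]->getType(), Args[3]->getType()})
// with ConvertType(E->getType()) prepended when NeedReturnType is set.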
1286 // amdgcn workgroup size
1287 case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
1288 return EmitAMDGPUWorkGroupSize(*this, 0);
1289 case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
1290 return EmitAMDGPUWorkGroupSize(*this, 1);
1291 case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
1292 return EmitAMDGPUWorkGroupSize(*this, 2);
1293
1294 // amdgcn grid size
1295 case AMDGPU::BI__builtin_amdgcn_grid_size_x:
1296 return EmitAMDGPUGridSize(*this, 0);
1297 case AMDGPU::BI__builtin_amdgcn_grid_size_y:
1298 return EmitAMDGPUGridSize(*this, 1);
1299 case AMDGPU::BI__builtin_amdgcn_grid_size_z:
1300 return EmitAMDGPUGridSize(*this, 2);
1301
1302 // r600 intrinsics
1303 case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
1304 case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
1305 return emitBuiltinWithOneOverloadedType<1>(*this, E,
1306 Intrinsic::r600_recipsqrt_ieee);
1307 case AMDGPU::BI__builtin_amdgcn_alignbit: {
1308 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
1309 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
1310 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
1311 Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
1312 return Builder.CreateCall(F, { Src0, Src1, Src2 });
1313 }
1314 case AMDGPU::BI__builtin_amdgcn_fence: {
1315 ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
1316 EmitScalarExpr(E->getArg(1)), AO, SSID);
1317 FenceInst *Fence = Builder.CreateFence(AO, SSID);
1318 if (E->getNumArgs() > 2)
1319 AddAMDGPUFenceAddressSpaceMMRA(Fence, E);
1320 return Fence;
1321 }
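// For illustration, a two-argument call such as
//   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
// lowers to approximately
//   fence syncscope("workgroup") seq_cst
// while any additional address-space arguments are attached to the fence as MMRA
// metadata by AddAMDGPUFenceAddressSpaceMMRA.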
1322 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1323 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1324 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1325 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1326 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1327 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1328 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1329 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1330 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1331 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1332 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1333 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1334 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1335 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1336 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1337 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1338 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1339 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1340 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1341 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1342 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1343 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1344 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1345 llvm::AtomicRMWInst::BinOp BinOp;
1346 switch (BuiltinID) {
1347 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1348 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1349 BinOp = llvm::AtomicRMWInst::UIncWrap;
1350 break;
1351 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1352 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1353 BinOp = llvm::AtomicRMWInst::UDecWrap;
1354 break;
1355 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1356 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1357 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1358 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1359 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1360 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1361 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1362 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1363 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1364 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1365 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1366 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1367 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1368 BinOp = llvm::AtomicRMWInst::FAdd;
1369 break;
1370 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1371 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1372 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1373 BinOp = llvm::AtomicRMWInst::FMin;
1374 break;
1375 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1376 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1377 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1378 BinOp = llvm::AtomicRMWInst::FMax;
1379 break;
1380 }
1381
1382 Address Ptr = CheckAtomicAlignment(*this, E);
1383 Value *Val = EmitScalarExpr(E->getArg(1));
1384 llvm::Type *OrigTy = Val->getType();
1385 QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
1386
1387 bool Volatile;
1388
1389 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1390 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1391 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
1392 // __builtin_amdgcn_ds_faddf/fminf/fmaxf have an explicit volatile argument
1393 Volatile =
1394 cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
1395 } else {
1396 // Infer volatile from the passed type.
1397 Volatile =
1398 PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1399 }
1400
1401 if (E->getNumArgs() >= 4) {
1402 // Some of the builtins have explicit ordering and scope arguments.
1403 ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1404 EmitScalarExpr(E->getArg(3)), AO, SSID);
1405 } else {
1406 // Most of the builtins do not have syncscope/order arguments. For DS
1407 // atomics the scope doesn't really matter, as they implicitly operate at
1408 // workgroup scope.
1409 //
1410 // The global/flat cases need to use agent scope to consistently produce
1411 // the native instruction instead of a cmpxchg expansion.
1412 SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1413 AO = AtomicOrdering::Monotonic;
1414
1415 // The v2bf16 builtin uses i16 instead of a natural bfloat type.
1416 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
1417 BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
1418 BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1419 llvm::Type *V2BF16Ty = FixedVectorType::get(
1420 llvm::Type::getBFloatTy(Builder.getContext()), 2);
1421 Val = Builder.CreateBitCast(Val, V2BF16Ty);
1422 }
1423 }
1424
1425 llvm::AtomicRMWInst *RMW =
1426 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1427 if (Volatile)
1428 RMW->setVolatile(true);
1429
1430 unsigned AddrSpace = Ptr.getType()->getAddressSpace();
1431 if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
1432 // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
1433 // instruction for flat and global operations.
1434 llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
1435 RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
1436
1437 // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
1438 // instruction, but this only matters for float fadd.
1439 if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
1440 RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
1441 }
1442
1443 return Builder.CreateBitCast(RMW, OrigTy);
1444 }
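// For illustration (approximate IR), a global-memory add such as
//   __builtin_amdgcn_global_atomic_fadd_f32(p, v);
// becomes roughly
//   %old = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") monotonic,
//          !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
// whereas LDS (local address space) variants skip the metadata, which only affects
// flat and global operations.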
1445 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1446 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
1447 llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
1448 llvm::Type *ResultType = ConvertType(E->getType());
1449 // s_sendmsg_rtn is mangled using return type only.
1450 Function *F =
1451 CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
1452 return Builder.CreateCall(F, {Arg});
1453 }
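// For illustration, with a 64-bit result the call above produces roughly
//   %r = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 %msg)
// i.e. only the return type participates in the intrinsic's name mangling.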
1454 case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
1455 case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
1456 // Because builtin types are limited, and the intrinsic uses a struct/pair
1457 // output, marshal the pair-of-i32 to <2 x i32>.
1458 Value *VDstOld = EmitScalarExpr(E->getArg(0));
1459 Value *VSrcOld = EmitScalarExpr(E->getArg(1));
1460 Value *FI = EmitScalarExpr(E->getArg(2));
1461 Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
1462 Function *F =
1463 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
1464 ? Intrinsic::amdgcn_permlane16_swap
1465 : Intrinsic::amdgcn_permlane32_swap);
1466 llvm::CallInst *Call =
1467 Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});
1468
1469 llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
1470 llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);
1471
1472 llvm::Type *ResultType = ConvertType(E->getType());
1473
1474 llvm::Value *Insert0 = Builder.CreateInsertElement(
1475 llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
1476 llvm::Value *AsVector =
1477 Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
1478 return AsVector;
1479 }
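// For illustration (approximate IR), the repacking above looks like
//   %pair = call { i32, i32 } @llvm.amdgcn.permlane16.swap(...)
//   %e0   = extractvalue { i32, i32 } %pair, 0
//   %e1   = extractvalue { i32, i32 } %pair, 1
//   %v0   = insertelement <2 x i32> poison, i32 %e0, i64 0
//   %res  = insertelement <2 x i32> %v0, i32 %e1, i64 1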
1480 case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
1481 case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
1482 return emitBuiltinWithOneOverloadedType<4>(*this, E,
1483 Intrinsic::amdgcn_bitop3);
1484 case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
1485 // TODO: LLVM has this overloaded to allow for fat pointers, but since
1486 // those haven't been plumbed through to Clang yet, default to creating the
1487 // resource type.
1488 SmallVector<Value *, 4> Args;
1489 for (unsigned I = 0; I < 4; ++I)
1490 Args.push_back(EmitScalarExpr(E->getArg(I)));
1491 llvm::PointerType *RetTy = llvm::PointerType::get(
1492 Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
1493 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
1494 {RetTy, Args[0]->getType()});
1495 return Builder.CreateCall(F, Args);
1496 }
1497 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
1498 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
1499 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
1500 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
1501 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
1502 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
1503 return emitBuiltinWithOneOverloadedType<5>(
1504 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
1505 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1506 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1507 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1508 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1509 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1510 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
1511 llvm::Type *RetTy = nullptr;
1512 switch (BuiltinID) {
1513 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1514 RetTy = Int8Ty;
1515 break;
1516 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1517 RetTy = Int16Ty;
1518 break;
1519 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1520 RetTy = Int32Ty;
1521 break;
1522 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1523 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
1524 break;
1525 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1526 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
1527 break;
1528 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
1529 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
1530 break;
1531 }
1532 Function *F =
1533 CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
1534 return Builder.CreateCall(
1535 F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
1536 EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
1537 }
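// For illustration, the return type selected above is the only overloaded type, so
// __builtin_amdgcn_raw_buffer_load_b96 maps to approximately
//   call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) %rsrc,
//       i32 %voffset, i32 %soffset, i32 %aux)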
1538 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32:
1539 return emitBuiltinWithOneOverloadedType<5>(
1540 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_add);
1541 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32:
1542 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16:
1543 return emitBuiltinWithOneOverloadedType<5>(
1544 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd);
1545 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32:
1546 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f64:
1547 return emitBuiltinWithOneOverloadedType<5>(
1548 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin);
1549 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32:
1550 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64:
1551 return emitBuiltinWithOneOverloadedType<5>(
1552 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax);
1553 case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
1554 return emitBuiltinWithOneOverloadedType<2>(
1555 *this, E, Intrinsic::amdgcn_s_prefetch_data);
1556 case Builtin::BIlogbf:
1557 case Builtin::BI__builtin_logbf: {
1558 Value *Src0 = EmitScalarExpr(E->getArg(0));
1559 Function *FrExpFunc = CGM.getIntrinsic(
1560 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1561 CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1562 Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1563 Value *Add = Builder.CreateAdd(
1564 Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1565 Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
1566 Value *Fabs =
1567 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1568 Value *FCmpONE = Builder.CreateFCmpONE(
1569 Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
1570 Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1571 Value *FCmpOEQ =
1572 Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy()));
1573 Value *Sel2 = Builder.CreateSelect(
1574 FCmpOEQ,
1575 ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1);
1576 return Sel2;
1577 }
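// For illustration: llvm.frexp normalizes the mantissa into [0.5, 1.0), so for
// finite non-zero x the sequence above yields logbf(x) = frexp_exp(x) - 1; the two
// selects then map |x| == +inf to +inf and x == 0.0f to -inf. Example:
// frexp(8.0f) = {0.5f, 4}, so logbf(8.0f) = 4 - 1 = 3.0f.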
1578 case Builtin::BIlogb:
1579 case Builtin::BI__builtin_logb: {
1580 Value *Src0 = EmitScalarExpr(E->getArg(0));
1581 Function *FrExpFunc = CGM.getIntrinsic(
1582 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1583 CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1584 Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1585 Value *Add = Builder.CreateAdd(
1586 Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1587 Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
1588 Value *Fabs =
1589 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1590 Value *FCmpONE = Builder.CreateFCmpONE(
1591 Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
1592 Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1593 Value *FCmpOEQ =
1594 Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy()));
1595 Value *Sel2 = Builder.CreateSelect(
1596 FCmpOEQ,
1597 ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
1598 Sel1);
1599 return Sel2;
1600 }
1601 case Builtin::BIscalbnf:
1602 case Builtin::BI__builtin_scalbnf:
1603 case Builtin::BIscalbn:
1604 case Builtin::BI__builtin_scalbn:
1605 return emitBinaryExpMaybeConstrainedFPBuiltin(
1606 *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
1607 default:
1608 return nullptr;
1609 }
1610}