clang 22.0.0git
AMDGPU.cpp
Go to the documentation of this file.
1//===------- AMDCPU.cpp - Emit LLVM Code for builtins ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This contains code to emit Builtin calls as LLVM code.
10//
11//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "CodeGenFunction.h"
18#include "llvm/Analysis/ValueTracking.h"
19#include "llvm/CodeGen/MachineFunction.h"
20#include "llvm/IR/IntrinsicsAMDGPU.h"
21#include "llvm/IR/IntrinsicsR600.h"
22#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
23#include "llvm/Support/AMDGPUAddrSpace.h"
24
25using namespace clang;
26using namespace CodeGen;
27using namespace llvm;
28
29namespace {
30
31// Has second type mangled argument.
32static Value *
34 Intrinsic::ID IntrinsicID,
35 Intrinsic::ID ConstrainedIntrinsicID) {
36 llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
37 llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
38
39 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
40 if (CGF.Builder.getIsFPConstrained()) {
41 Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
42 {Src0->getType(), Src1->getType()});
43 return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
44 }
45
46 Function *F =
47 CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
48 return CGF.Builder.CreateCall(F, {Src0, Src1});
49}
50
51// If \p E is not null pointer, insert address space cast to match return
52// type of \p E if necessary.
53Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
54 const CallExpr *E = nullptr) {
55 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
56 auto *Call = CGF.Builder.CreateCall(F);
57 Call->addRetAttr(
58 Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
59 Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
60 if (!E)
61 return Call;
62 QualType BuiltinRetType = E->getType();
63 auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
64 if (RetTy == Call->getType())
65 return Call;
66 return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
67}
68
69Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
70 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
71 auto *Call = CGF.Builder.CreateCall(F);
72 Call->addRetAttr(
73 Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
74 Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
75 return Call;
76}
77
78// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
79/// Emit code based on Code Object ABI version.
80/// COV_4 : Emit code to use dispatch ptr
81/// COV_5+ : Emit code to use implicitarg ptr
82/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
83/// and use its value for COV_4 or COV_5+ approach. It is used for
84/// compiling device libraries in an ABI-agnostic way.
85Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
86 llvm::LoadInst *LD;
87
88 auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
89
90 if (Cov == CodeObjectVersionKind::COV_None) {
91 StringRef Name = "__oclc_ABI_version";
92 auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
93 if (!ABIVersionC)
94 ABIVersionC = new llvm::GlobalVariable(
95 CGF.CGM.getModule(), CGF.Int32Ty, false,
96 llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
97 llvm::GlobalVariable::NotThreadLocal,
99
100 // This load will be eliminated by the IPSCCP because it is constant
101 // weak_odr without externally_initialized. Either changing it to weak or
102 // adding externally_initialized will keep the load.
103 Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
104 CGF.CGM.getIntAlign());
105
106 Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
107 ABIVersion,
108 llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));
109
110 // Indexing the implicit kernarg segment.
111 Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
112 CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
113
114 // Indexing the HSA kernel_dispatch_packet struct.
115 Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
116 CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
117
118 auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
119 LD = CGF.Builder.CreateLoad(
120 Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
121 } else {
122 Value *GEP = nullptr;
123 if (Cov >= CodeObjectVersionKind::COV_5) {
124 // Indexing the implicit kernarg segment.
125 GEP = CGF.Builder.CreateConstGEP1_32(
126 CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
127 } else {
128 // Indexing the HSA kernel_dispatch_packet struct.
129 GEP = CGF.Builder.CreateConstGEP1_32(
130 CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
131 }
132 LD = CGF.Builder.CreateLoad(
134 }
135
136 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
137 llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
138 APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
139 LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
140 LD->setMetadata(llvm::LLVMContext::MD_noundef,
141 llvm::MDNode::get(CGF.getLLVMContext(), {}));
142 LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
143 llvm::MDNode::get(CGF.getLLVMContext(), {}));
144 return LD;
145}
146
147// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
148Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
149 const unsigned XOffset = 12;
150 auto *DP = EmitAMDGPUDispatchPtr(CGF);
151 // Indexing the HSA kernel_dispatch_packet struct.
152 auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
153 auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
154 auto *LD = CGF.Builder.CreateLoad(
156
157 llvm::MDBuilder MDB(CGF.getLLVMContext());
158
159 // Known non-zero.
160 LD->setMetadata(llvm::LLVMContext::MD_range,
161 MDB.createRange(APInt(32, 1), APInt::getZero(32)));
162 LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
163 llvm::MDNode::get(CGF.getLLVMContext(), {}));
164 return LD;
165}
166} // namespace
167
168// Generates the IR for __builtin_read_exec_*.
169// Lowers the builtin to amdgcn_ballot intrinsic.
171 llvm::Type *RegisterType,
172 llvm::Type *ValueType, bool isExecHi) {
173 CodeGen::CGBuilderTy &Builder = CGF.Builder;
174 CodeGen::CodeGenModule &CGM = CGF.CGM;
175
176 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
177 llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});
178
179 if (isExecHi) {
180 Value *Rt2 = Builder.CreateLShr(Call, 32);
181 Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
182 return Rt2;
183 }
184
185 return Call;
186}
187
189 llvm::Value *RsrcPtr) {
190 auto &B = CGF.Builder;
191 auto *VecTy = llvm::FixedVectorType::get(B.getInt32Ty(), 8);
192
193 if (RsrcPtr->getType() == VecTy)
194 return RsrcPtr;
195
196 if (RsrcPtr->getType()->isIntegerTy(32)) {
197 llvm::PointerType *VecPtrTy =
198 llvm::PointerType::get(CGF.getLLVMContext(), 8);
199 llvm::Value *Ptr = B.CreateIntToPtr(RsrcPtr, VecPtrTy, "tex.rsrc.from.int");
200 return B.CreateAlignedLoad(VecTy, Ptr, llvm::Align(32), "tex.rsrc.val");
201 }
202
203 if (RsrcPtr->getType()->isPointerTy()) {
204 auto *VecPtrTy = llvm::PointerType::get(
205 CGF.getLLVMContext(), RsrcPtr->getType()->getPointerAddressSpace());
206 llvm::Value *Typed = B.CreateBitCast(RsrcPtr, VecPtrTy, "tex.rsrc.typed");
207 return B.CreateAlignedLoad(VecTy, Typed, llvm::Align(32), "tex.rsrc.val");
208 }
209
210 const auto &DL = CGF.CGM.getDataLayout();
211 if (DL.getTypeSizeInBits(RsrcPtr->getType()) == 256)
212 return B.CreateBitCast(RsrcPtr, VecTy, "tex.rsrc.val");
213
214 llvm::report_fatal_error("Unexpected texture resource argument form");
215}
216
217llvm::CallInst *
219 const clang::CallExpr *E,
220 unsigned IntrinsicID, bool IsImageStore) {
221 auto findTextureDescIndex = [&CGF](const CallExpr *E) -> unsigned {
222 QualType TexQT = CGF.getContext().AMDGPUTextureTy;
223 for (unsigned I = 0, N = E->getNumArgs(); I < N; ++I) {
224 QualType ArgTy = E->getArg(I)->getType();
225 if (ArgTy == TexQT) {
226 return I;
227 }
228
229 if (ArgTy.getCanonicalType() == TexQT.getCanonicalType()) {
230 return I;
231 }
232 }
233
234 return ~0U;
235 };
236
238 unsigned RsrcIndex = findTextureDescIndex(E);
239
240 if (RsrcIndex == ~0U) {
241 llvm::report_fatal_error("Invalid argument count for image builtin");
242 }
243
244 for (unsigned I = 0; I < E->getNumArgs(); ++I) {
245 llvm::Value *V = CGF.EmitScalarExpr(E->getArg(I));
246 if (I == RsrcIndex)
248 Args.push_back(V);
249 }
250
251 llvm::Type *RetTy = IsImageStore ? CGF.VoidTy : CGF.ConvertType(E->getType());
252 llvm::CallInst *Call = CGF.Builder.CreateIntrinsic(RetTy, IntrinsicID, Args);
253 return Call;
254}
255
256// Emit an intrinsic that has 1 float or double operand, and 1 integer.
258 const CallExpr *E,
259 unsigned IntrinsicID) {
260 llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
261 llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
262
263 Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
264 return CGF.Builder.CreateCall(F, {Src0, Src1});
265}
266
267static inline StringRef mapScopeToSPIRV(StringRef AMDGCNScope) {
268 if (AMDGCNScope == "agent")
269 return "device";
270 if (AMDGCNScope == "wavefront")
271 return "subgroup";
272 return AMDGCNScope;
273}
274
275// For processing memory ordering and memory scope arguments of various
276// amdgcn builtins.
277// \p Order takes a C++11 compatible memory-ordering specifier and converts
278// it into LLVM's memory ordering specifier using atomic C ABI, and writes
279// to \p AO. \p Scope takes a const char * and converts it into AMDGCN
280// specific SyncScopeID and writes it to \p SSID.
282 llvm::AtomicOrdering &AO,
283 llvm::SyncScope::ID &SSID) {
284 int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
285
286 // Map C11/C++11 memory ordering to LLVM memory ordering
287 assert(llvm::isValidAtomicOrderingCABI(ord));
288 switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
289 case llvm::AtomicOrderingCABI::acquire:
290 case llvm::AtomicOrderingCABI::consume:
291 AO = llvm::AtomicOrdering::Acquire;
292 break;
293 case llvm::AtomicOrderingCABI::release:
294 AO = llvm::AtomicOrdering::Release;
295 break;
296 case llvm::AtomicOrderingCABI::acq_rel:
297 AO = llvm::AtomicOrdering::AcquireRelease;
298 break;
299 case llvm::AtomicOrderingCABI::seq_cst:
300 AO = llvm::AtomicOrdering::SequentiallyConsistent;
301 break;
302 case llvm::AtomicOrderingCABI::relaxed:
303 AO = llvm::AtomicOrdering::Monotonic;
304 break;
305 }
306
307 // Some of the atomic builtins take the scope as a string name.
308 StringRef scp;
309 if (llvm::getConstantStringInfo(Scope, scp)) {
310 if (getTarget().getTriple().isSPIRV())
311 scp = mapScopeToSPIRV(scp);
312 SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
313 return;
314 }
315
316 // Older builtins had an enum argument for the memory scope.
317 const char *SSN = nullptr;
318 int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
319 switch (scope) {
320 case AtomicScopeGenericModel::System: // __MEMORY_SCOPE_SYSTEM
321 SSID = llvm::SyncScope::System;
322 break;
323 case AtomicScopeGenericModel::Device: // __MEMORY_SCOPE_DEVICE
324 SSN = getTarget().getTriple().isSPIRV() ? "device" : "agent";
325 break;
326 case AtomicScopeGenericModel::Workgroup: // __MEMORY_SCOPE_WRKGRP
327 SSN = "workgroup";
328 break;
329 case AtomicScopeGenericModel::Cluster: // __MEMORY_SCOPE_CLUSTR
330 SSN = getTarget().getTriple().isSPIRV() ? "workgroup" : "cluster";
331 break;
332 case AtomicScopeGenericModel::Wavefront: // __MEMORY_SCOPE_WVFRNT
333 SSN = getTarget().getTriple().isSPIRV() ? "subgroup" : "wavefront";
334 break;
335 case AtomicScopeGenericModel::Single: // __MEMORY_SCOPE_SINGLE
336 SSID = llvm::SyncScope::SingleThread;
337 break;
338 default:
339 SSID = llvm::SyncScope::System;
340 break;
341 }
342 if (SSN)
343 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN);
344}
345
346llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
347 unsigned Idx,
348 const CallExpr *E) {
349 llvm::Value *Arg = nullptr;
350 if ((ICEArguments & (1 << Idx)) == 0) {
351 Arg = EmitScalarExpr(E->getArg(Idx));
352 } else {
353 // If this is required to be a constant, constant fold it so that we
354 // know that the generated intrinsic gets a ConstantInt.
355 std::optional<llvm::APSInt> Result =
357 assert(Result && "Expected argument to be a constant");
358 Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
359 }
360 return Arg;
361}
362
364 const CallExpr *E) {
365 constexpr const char *Tag = "amdgpu-synchronize-as";
366
367 LLVMContext &Ctx = Inst->getContext();
369 for (unsigned K = 2; K < E->getNumArgs(); ++K) {
370 llvm::Value *V = EmitScalarExpr(E->getArg(K));
371 StringRef AS;
372 if (llvm::getConstantStringInfo(V, AS)) {
373 MMRAs.push_back({Tag, AS});
374 // TODO: Delete the resulting unused constant?
375 continue;
376 }
377 CGM.Error(E->getExprLoc(),
378 "expected an address space name as a string literal");
379 }
380
381 llvm::sort(MMRAs);
382 MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
383 Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
384}
385
386static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
387 switch (BuiltinID) {
388 default:
389 llvm_unreachable("Unknown BuiltinID for wave reduction");
390 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
391 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
392 return Intrinsic::amdgcn_wave_reduce_add;
393 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
394 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
395 return Intrinsic::amdgcn_wave_reduce_sub;
396 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
397 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
398 return Intrinsic::amdgcn_wave_reduce_min;
399 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
400 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
401 return Intrinsic::amdgcn_wave_reduce_umin;
402 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
403 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
404 return Intrinsic::amdgcn_wave_reduce_max;
405 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
406 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
407 return Intrinsic::amdgcn_wave_reduce_umax;
408 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
409 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
410 return Intrinsic::amdgcn_wave_reduce_and;
411 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
412 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
413 return Intrinsic::amdgcn_wave_reduce_or;
414 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
415 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
416 return Intrinsic::amdgcn_wave_reduce_xor;
417 }
418}
419
421 const CallExpr *E) {
422 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
423 llvm::SyncScope::ID SSID;
424 switch (BuiltinID) {
425 case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
426 case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
427 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
428 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
429 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
430 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
431 case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
432 case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
433 case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
434 case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
435 case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
436 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
437 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
438 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
439 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
440 case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
441 case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
442 case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: {
443 Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
444 llvm::Value *Value = EmitScalarExpr(E->getArg(0));
445 llvm::Value *Strategy = EmitScalarExpr(E->getArg(1));
446 llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
447 return Builder.CreateCall(F, {Value, Strategy});
448 }
449 case AMDGPU::BI__builtin_amdgcn_div_scale:
450 case AMDGPU::BI__builtin_amdgcn_div_scalef: {
451 // Translate from the intrinsics's struct return to the builtin's out
452 // argument.
453
454 Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
455
456 llvm::Value *X = EmitScalarExpr(E->getArg(0));
457 llvm::Value *Y = EmitScalarExpr(E->getArg(1));
458 llvm::Value *Z = EmitScalarExpr(E->getArg(2));
459
460 llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
461 X->getType());
462
463 llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
464
465 llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
466 llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
467
468 llvm::Type *RealFlagType = FlagOutPtr.getElementType();
469
470 llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
471 Builder.CreateStore(FlagExt, FlagOutPtr);
472 return Result;
473 }
474 case AMDGPU::BI__builtin_amdgcn_div_fmas:
475 case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
476 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
477 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
478 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
479 llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
480
481 llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
482 Src0->getType());
483 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
484 return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
485 }
486
487 case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
489 Intrinsic::amdgcn_ds_swizzle);
490 case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
491 case AMDGPU::BI__builtin_amdgcn_mov_dpp:
492 case AMDGPU::BI__builtin_amdgcn_update_dpp: {
494 // Find out if any arguments are required to be integer constant
495 // expressions.
496 unsigned ICEArguments = 0;
498 getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
499 assert(Error == ASTContext::GE_None && "Should not codegen an error");
500 llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
501 unsigned Size = DataTy->getPrimitiveSizeInBits();
502 llvm::Type *IntTy =
503 llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
504 Function *F =
505 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
506 ? Intrinsic::amdgcn_mov_dpp8
507 : Intrinsic::amdgcn_update_dpp,
508 IntTy);
509 assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
510 E->getNumArgs() == 2);
511 bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
512 if (InsertOld)
513 Args.push_back(llvm::PoisonValue::get(IntTy));
514 for (unsigned I = 0; I != E->getNumArgs(); ++I) {
515 llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
516 if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
517 Size < 32) {
518 if (!DataTy->isIntegerTy())
519 V = Builder.CreateBitCast(
520 V, llvm::IntegerType::get(Builder.getContext(), Size));
521 V = Builder.CreateZExtOrBitCast(V, IntTy);
522 }
523 llvm::Type *ExpTy =
524 F->getFunctionType()->getFunctionParamType(I + InsertOld);
525 Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
526 }
527 Value *V = Builder.CreateCall(F, Args);
528 if (Size < 32 && !DataTy->isIntegerTy())
529 V = Builder.CreateTrunc(
530 V, llvm::IntegerType::get(Builder.getContext(), Size));
531 return Builder.CreateTruncOrBitCast(V, DataTy);
532 }
533 case AMDGPU::BI__builtin_amdgcn_permlane16:
534 case AMDGPU::BI__builtin_amdgcn_permlanex16:
536 *this, E,
537 BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
538 ? Intrinsic::amdgcn_permlane16
539 : Intrinsic::amdgcn_permlanex16);
540 case AMDGPU::BI__builtin_amdgcn_permlane64:
542 Intrinsic::amdgcn_permlane64);
543 case AMDGPU::BI__builtin_amdgcn_readlane:
545 Intrinsic::amdgcn_readlane);
546 case AMDGPU::BI__builtin_amdgcn_readfirstlane:
548 Intrinsic::amdgcn_readfirstlane);
549 case AMDGPU::BI__builtin_amdgcn_div_fixup:
550 case AMDGPU::BI__builtin_amdgcn_div_fixupf:
551 case AMDGPU::BI__builtin_amdgcn_div_fixuph:
553 Intrinsic::amdgcn_div_fixup);
554 case AMDGPU::BI__builtin_amdgcn_trig_preop:
555 case AMDGPU::BI__builtin_amdgcn_trig_preopf:
556 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
557 case AMDGPU::BI__builtin_amdgcn_rcp:
558 case AMDGPU::BI__builtin_amdgcn_rcpf:
559 case AMDGPU::BI__builtin_amdgcn_rcph:
560 case AMDGPU::BI__builtin_amdgcn_rcp_bf16:
561 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
562 case AMDGPU::BI__builtin_amdgcn_sqrt:
563 case AMDGPU::BI__builtin_amdgcn_sqrtf:
564 case AMDGPU::BI__builtin_amdgcn_sqrth:
565 case AMDGPU::BI__builtin_amdgcn_sqrt_bf16:
567 Intrinsic::amdgcn_sqrt);
568 case AMDGPU::BI__builtin_amdgcn_rsq:
569 case AMDGPU::BI__builtin_amdgcn_rsqf:
570 case AMDGPU::BI__builtin_amdgcn_rsqh:
571 case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
572 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
573 case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
574 case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
576 Intrinsic::amdgcn_rsq_clamp);
577 case AMDGPU::BI__builtin_amdgcn_sinf:
578 case AMDGPU::BI__builtin_amdgcn_sinh:
579 case AMDGPU::BI__builtin_amdgcn_sin_bf16:
580 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
581 case AMDGPU::BI__builtin_amdgcn_cosf:
582 case AMDGPU::BI__builtin_amdgcn_cosh:
583 case AMDGPU::BI__builtin_amdgcn_cos_bf16:
584 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
585 case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
586 return EmitAMDGPUDispatchPtr(*this, E);
587 case AMDGPU::BI__builtin_amdgcn_logf:
588 case AMDGPU::BI__builtin_amdgcn_log_bf16:
589 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
590 case AMDGPU::BI__builtin_amdgcn_exp2f:
591 case AMDGPU::BI__builtin_amdgcn_exp2_bf16:
593 Intrinsic::amdgcn_exp2);
594 case AMDGPU::BI__builtin_amdgcn_log_clampf:
596 Intrinsic::amdgcn_log_clamp);
597 case AMDGPU::BI__builtin_amdgcn_ldexp:
598 case AMDGPU::BI__builtin_amdgcn_ldexpf: {
599 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
600 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
601 llvm::Function *F =
602 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
603 return Builder.CreateCall(F, {Src0, Src1});
604 }
605 case AMDGPU::BI__builtin_amdgcn_ldexph: {
606 // The raw instruction has a different behavior for out of bounds exponent
607 // values (implicit truncation instead of saturate to short_min/short_max).
608 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
609 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
610 llvm::Function *F =
611 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
612 return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
613 }
614 case AMDGPU::BI__builtin_amdgcn_frexp_mant:
615 case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
616 case AMDGPU::BI__builtin_amdgcn_frexp_manth:
618 Intrinsic::amdgcn_frexp_mant);
619 case AMDGPU::BI__builtin_amdgcn_frexp_exp:
620 case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
621 Value *Src0 = EmitScalarExpr(E->getArg(0));
622 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
623 { Builder.getInt32Ty(), Src0->getType() });
624 return Builder.CreateCall(F, Src0);
625 }
626 case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
627 Value *Src0 = EmitScalarExpr(E->getArg(0));
628 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
629 { Builder.getInt16Ty(), Src0->getType() });
630 return Builder.CreateCall(F, Src0);
631 }
632 case AMDGPU::BI__builtin_amdgcn_fract:
633 case AMDGPU::BI__builtin_amdgcn_fractf:
634 case AMDGPU::BI__builtin_amdgcn_fracth:
636 Intrinsic::amdgcn_fract);
637 case AMDGPU::BI__builtin_amdgcn_lerp:
639 Intrinsic::amdgcn_lerp);
640 case AMDGPU::BI__builtin_amdgcn_ubfe:
642 Intrinsic::amdgcn_ubfe);
643 case AMDGPU::BI__builtin_amdgcn_sbfe:
645 Intrinsic::amdgcn_sbfe);
646 case AMDGPU::BI__builtin_amdgcn_ballot_w32:
647 case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
648 llvm::Type *ResultType = ConvertType(E->getType());
649 llvm::Value *Src = EmitScalarExpr(E->getArg(0));
650 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {ResultType});
651 return Builder.CreateCall(F, {Src});
652 }
653 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
654 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
655 llvm::Value *Src = EmitScalarExpr(E->getArg(0));
656 Function *F =
657 CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()});
658 return Builder.CreateCall(F, {Src});
659 }
660 case AMDGPU::BI__builtin_amdgcn_tanhf:
661 case AMDGPU::BI__builtin_amdgcn_tanhh:
662 case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
664 Intrinsic::amdgcn_tanh);
665 case AMDGPU::BI__builtin_amdgcn_uicmp:
666 case AMDGPU::BI__builtin_amdgcn_uicmpl:
667 case AMDGPU::BI__builtin_amdgcn_sicmp:
668 case AMDGPU::BI__builtin_amdgcn_sicmpl: {
669 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
670 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
671 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
672
673 // FIXME-GFX10: How should 32 bit mask be handled?
674 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
675 { Builder.getInt64Ty(), Src0->getType() });
676 return Builder.CreateCall(F, { Src0, Src1, Src2 });
677 }
678 case AMDGPU::BI__builtin_amdgcn_fcmp:
679 case AMDGPU::BI__builtin_amdgcn_fcmpf: {
680 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
681 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
682 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
683
684 // FIXME-GFX10: How should 32 bit mask be handled?
685 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
686 { Builder.getInt64Ty(), Src0->getType() });
687 return Builder.CreateCall(F, { Src0, Src1, Src2 });
688 }
689 case AMDGPU::BI__builtin_amdgcn_class:
690 case AMDGPU::BI__builtin_amdgcn_classf:
691 case AMDGPU::BI__builtin_amdgcn_classh:
692 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
693 case AMDGPU::BI__builtin_amdgcn_fmed3f:
694 case AMDGPU::BI__builtin_amdgcn_fmed3h:
696 Intrinsic::amdgcn_fmed3);
697 case AMDGPU::BI__builtin_amdgcn_ds_append:
698 case AMDGPU::BI__builtin_amdgcn_ds_consume: {
699 Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
700 Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
701 Value *Src0 = EmitScalarExpr(E->getArg(0));
702 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
703 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
704 }
705 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
706 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
707 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
708 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
709 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
710 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
711 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
712 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
713 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
714 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
715 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
716 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
717 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
718 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
719 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
720 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
721 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
722 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
723 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
724 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
725 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
726 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
727 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
728 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
729 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
730 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
731 Intrinsic::ID IID;
732 switch (BuiltinID) {
733 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
734 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
735 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
736 IID = Intrinsic::amdgcn_global_load_tr_b64;
737 break;
738 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
739 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
740 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
741 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
742 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
743 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
744 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
745 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
746 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
747 IID = Intrinsic::amdgcn_global_load_tr_b128;
748 break;
749 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
750 IID = Intrinsic::amdgcn_global_load_tr4_b64;
751 break;
752 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
753 IID = Intrinsic::amdgcn_global_load_tr6_b96;
754 break;
755 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
756 IID = Intrinsic::amdgcn_ds_load_tr4_b64;
757 break;
758 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
759 IID = Intrinsic::amdgcn_ds_load_tr6_b96;
760 break;
761 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
762 IID = Intrinsic::amdgcn_ds_load_tr8_b64;
763 break;
764 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
765 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
766 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
767 IID = Intrinsic::amdgcn_ds_load_tr16_b128;
768 break;
769 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
770 IID = Intrinsic::amdgcn_ds_read_tr4_b64;
771 break;
772 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
773 IID = Intrinsic::amdgcn_ds_read_tr8_b64;
774 break;
775 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
776 IID = Intrinsic::amdgcn_ds_read_tr6_b96;
777 break;
778 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
779 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
780 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
781 IID = Intrinsic::amdgcn_ds_read_tr16_b64;
782 break;
783 }
784 llvm::Type *LoadTy = ConvertType(E->getType());
785 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
786 llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
787 return Builder.CreateCall(F, {Addr});
788 }
789 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
790 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
791 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
792 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
793 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
794 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: {
795
796 Intrinsic::ID IID;
797 switch (BuiltinID) {
798 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
799 IID = Intrinsic::amdgcn_global_load_monitor_b32;
800 break;
801 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
802 IID = Intrinsic::amdgcn_global_load_monitor_b64;
803 break;
804 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
805 IID = Intrinsic::amdgcn_global_load_monitor_b128;
806 break;
807 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
808 IID = Intrinsic::amdgcn_flat_load_monitor_b32;
809 break;
810 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
811 IID = Intrinsic::amdgcn_flat_load_monitor_b64;
812 break;
813 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128:
814 IID = Intrinsic::amdgcn_flat_load_monitor_b128;
815 break;
816 }
817
818 llvm::Type *LoadTy = ConvertType(E->getType());
819 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
820 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
821 llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
822 return Builder.CreateCall(F, {Addr, Val});
823 }
824 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
825 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
826 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
827 Intrinsic::ID IID;
828 switch (BuiltinID) {
829 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
830 IID = Intrinsic::amdgcn_cluster_load_b32;
831 break;
832 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
833 IID = Intrinsic::amdgcn_cluster_load_b64;
834 break;
835 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
836 IID = Intrinsic::amdgcn_cluster_load_b128;
837 break;
838 }
840 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
841 Args.push_back(EmitScalarExpr(E->getArg(i)));
842 llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
843 return Builder.CreateCall(F, {Args});
844 }
845 case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
846 // Should this have asan instrumentation?
848 Intrinsic::amdgcn_load_to_lds);
849 }
850 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
851 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
852 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
853 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
854 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
855 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: {
856 Intrinsic::ID IID;
857 switch (BuiltinID) {
858 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
859 IID = Intrinsic::amdgcn_cooperative_atomic_load_32x4B;
860 break;
861 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
862 IID = Intrinsic::amdgcn_cooperative_atomic_store_32x4B;
863 break;
864 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
865 IID = Intrinsic::amdgcn_cooperative_atomic_load_16x8B;
866 break;
867 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
868 IID = Intrinsic::amdgcn_cooperative_atomic_store_16x8B;
869 break;
870 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
871 IID = Intrinsic::amdgcn_cooperative_atomic_load_8x16B;
872 break;
873 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
874 IID = Intrinsic::amdgcn_cooperative_atomic_store_8x16B;
875 break;
876 }
877
878 LLVMContext &Ctx = CGM.getLLVMContext();
880 // last argument is a MD string
881 const unsigned ScopeArg = E->getNumArgs() - 1;
882 for (unsigned i = 0; i != ScopeArg; ++i)
883 Args.push_back(EmitScalarExpr(E->getArg(i)));
884 StringRef Arg = cast<StringLiteral>(E->getArg(ScopeArg)->IgnoreParenCasts())
885 ->getString();
886 llvm::MDNode *MD = llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, Arg)});
887 Args.push_back(llvm::MetadataAsValue::get(Ctx, MD));
888 // Intrinsic is typed based on the pointer AS. Pointer is always the first
889 // argument.
890 llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
891 return Builder.CreateCall(F, {Args});
892 }
893 case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
894 Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
895 {llvm::Type::getInt64Ty(getLLVMContext())});
896 return Builder.CreateCall(F);
897 }
898 case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
899 Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
900 {llvm::Type::getInt64Ty(getLLVMContext())});
901 llvm::Value *Env = EmitScalarExpr(E->getArg(0));
902 return Builder.CreateCall(F, {Env});
903 }
904 case AMDGPU::BI__builtin_amdgcn_read_exec:
905 return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
906 case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
907 return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
908 case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
909 return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
910 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
911 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
912 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
913 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
914 llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
915 llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
916 llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
917 llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
918 llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
919 llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
920
921 // The builtins take these arguments as vec4 where the last element is
922 // ignored. The intrinsic takes them as vec3.
923 RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
924 {0, 1, 2});
925 RayDir =
926 Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
927 RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
928 {0, 1, 2});
929
930 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
931 {NodePtr->getType(), RayDir->getType()});
932 return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
933 RayInverseDir, TextureDescr});
934 }
935 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
936 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
937 Intrinsic::ID IID;
938 switch (BuiltinID) {
939 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
940 IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
941 break;
942 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
943 IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
944 break;
945 }
946 llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
947 llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
948 llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
949 llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
950 llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
951 llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
952 llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));
953
954 Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
955 Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));
956
957 llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
958
959 llvm::CallInst *CI = Builder.CreateCall(
960 IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
961 Offset, TextureDescr});
962
963 llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
964 llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
965 llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);
966
967 Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
968 Builder.CreateStore(RetRayDir, RetRayDirPtr);
969
970 return RetVData;
971 }
972
973 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
974 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
975 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
976 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
977 Intrinsic::ID IID;
978 switch (BuiltinID) {
979 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
980 IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
981 break;
982 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
983 IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
984 break;
985 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
986 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
987 break;
988 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
989 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
990 break;
991 }
992
994 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
995 Args.push_back(EmitScalarExpr(E->getArg(i)));
996
997 Function *F = CGM.getIntrinsic(IID);
998 Value *Call = Builder.CreateCall(F, Args);
999 Value *Rtn = Builder.CreateExtractValue(Call, 0);
1000 Value *A = Builder.CreateExtractValue(Call, 1);
1001 llvm::Type *RetTy = ConvertType(E->getType());
1002 Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
1003 (uint64_t)0);
1004 // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
1005 // <2 x i64>, zext the second value.
1006 if (A->getType()->getPrimitiveSizeInBits() <
1007 RetTy->getScalarType()->getPrimitiveSizeInBits())
1008 A = Builder.CreateZExt(A, RetTy->getScalarType());
1009
1010 return Builder.CreateInsertElement(I0, A, 1);
1011 }
1012 case AMDGPU::BI__builtin_amdgcn_image_load_1d_v4f32_i32:
1013 case AMDGPU::BI__builtin_amdgcn_image_load_1d_v4f16_i32:
1015 *this, E, Intrinsic::amdgcn_image_load_1d, false);
1016 case AMDGPU::BI__builtin_amdgcn_image_load_1darray_v4f32_i32:
1017 case AMDGPU::BI__builtin_amdgcn_image_load_1darray_v4f16_i32:
1019 *this, E, Intrinsic::amdgcn_image_load_1darray, false);
1020 case AMDGPU::BI__builtin_amdgcn_image_load_2d_f32_i32:
1021 case AMDGPU::BI__builtin_amdgcn_image_load_2d_v4f32_i32:
1022 case AMDGPU::BI__builtin_amdgcn_image_load_2d_v4f16_i32:
1024 *this, E, Intrinsic::amdgcn_image_load_2d, false);
1025 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_f32_i32:
1026 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_v4f32_i32:
1027 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_v4f16_i32:
1029 *this, E, Intrinsic::amdgcn_image_load_2darray, false);
1030 case AMDGPU::BI__builtin_amdgcn_image_load_3d_v4f32_i32:
1031 case AMDGPU::BI__builtin_amdgcn_image_load_3d_v4f16_i32:
1033 *this, E, Intrinsic::amdgcn_image_load_3d, false);
1034 case AMDGPU::BI__builtin_amdgcn_image_load_cube_v4f32_i32:
1035 case AMDGPU::BI__builtin_amdgcn_image_load_cube_v4f16_i32:
1037 *this, E, Intrinsic::amdgcn_image_load_cube, false);
1038 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1d_v4f32_i32:
1039 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1d_v4f16_i32:
1041 *this, E, Intrinsic::amdgcn_image_load_mip_1d, false);
1042 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1darray_v4f32_i32:
1043 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1darray_v4f16_i32:
1045 *this, E, Intrinsic::amdgcn_image_load_mip_1darray, false);
1046 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_f32_i32:
1047 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_v4f32_i32:
1048 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_v4f16_i32:
1050 *this, E, Intrinsic::amdgcn_image_load_mip_2d, false);
1051 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_f32_i32:
1052 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_v4f32_i32:
1053 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_v4f16_i32:
1055 *this, E, Intrinsic::amdgcn_image_load_mip_2darray, false);
1056 case AMDGPU::BI__builtin_amdgcn_image_load_mip_3d_v4f32_i32:
1057 case AMDGPU::BI__builtin_amdgcn_image_load_mip_3d_v4f16_i32:
1059 *this, E, Intrinsic::amdgcn_image_load_mip_3d, false);
1060 case AMDGPU::BI__builtin_amdgcn_image_load_mip_cube_v4f32_i32:
1061 case AMDGPU::BI__builtin_amdgcn_image_load_mip_cube_v4f16_i32:
1063 *this, E, Intrinsic::amdgcn_image_load_mip_cube, false);
1064 case AMDGPU::BI__builtin_amdgcn_image_store_1d_v4f32_i32:
1065 case AMDGPU::BI__builtin_amdgcn_image_store_1d_v4f16_i32:
1067 *this, E, Intrinsic::amdgcn_image_store_1d, true);
1068 case AMDGPU::BI__builtin_amdgcn_image_store_1darray_v4f32_i32:
1069 case AMDGPU::BI__builtin_amdgcn_image_store_1darray_v4f16_i32:
1071 *this, E, Intrinsic::amdgcn_image_store_1darray, true);
1072 case AMDGPU::BI__builtin_amdgcn_image_store_2d_f32_i32:
1073 case AMDGPU::BI__builtin_amdgcn_image_store_2d_v4f32_i32:
1074 case AMDGPU::BI__builtin_amdgcn_image_store_2d_v4f16_i32:
1076 *this, E, Intrinsic::amdgcn_image_store_2d, true);
1077 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_f32_i32:
1078 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_v4f32_i32:
1079 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_v4f16_i32:
1081 *this, E, Intrinsic::amdgcn_image_store_2darray, true);
1082 case AMDGPU::BI__builtin_amdgcn_image_store_3d_v4f32_i32:
1083 case AMDGPU::BI__builtin_amdgcn_image_store_3d_v4f16_i32:
1085 *this, E, Intrinsic::amdgcn_image_store_3d, true);
1086 case AMDGPU::BI__builtin_amdgcn_image_store_cube_v4f32_i32:
1087 case AMDGPU::BI__builtin_amdgcn_image_store_cube_v4f16_i32:
1089 *this, E, Intrinsic::amdgcn_image_store_cube, true);
1090 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1d_v4f32_i32:
1091 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1d_v4f16_i32:
1093 *this, E, Intrinsic::amdgcn_image_store_mip_1d, true);
1094 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1darray_v4f32_i32:
1095 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1darray_v4f16_i32:
1097 *this, E, Intrinsic::amdgcn_image_store_mip_1darray, true);
1098 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_f32_i32:
1099 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_v4f32_i32:
1100 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_v4f16_i32:
1102 *this, E, Intrinsic::amdgcn_image_store_mip_2d, true);
1103 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_f32_i32:
1104 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_v4f32_i32:
1105 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_v4f16_i32:
1107 *this, E, Intrinsic::amdgcn_image_store_mip_2darray, true);
1108 case AMDGPU::BI__builtin_amdgcn_image_store_mip_3d_v4f32_i32:
1109 case AMDGPU::BI__builtin_amdgcn_image_store_mip_3d_v4f16_i32:
1111 *this, E, Intrinsic::amdgcn_image_store_mip_3d, true);
1112 case AMDGPU::BI__builtin_amdgcn_image_store_mip_cube_v4f32_i32:
1113 case AMDGPU::BI__builtin_amdgcn_image_store_mip_cube_v4f16_i32:
1115 *this, E, Intrinsic::amdgcn_image_store_mip_cube, true);
1116 case AMDGPU::BI__builtin_amdgcn_image_sample_1d_v4f32_f32:
1117 case AMDGPU::BI__builtin_amdgcn_image_sample_1d_v4f16_f32:
1119 *this, E, Intrinsic::amdgcn_image_sample_1d, false);
1120 case AMDGPU::BI__builtin_amdgcn_image_sample_1darray_v4f32_f32:
1121 case AMDGPU::BI__builtin_amdgcn_image_sample_1darray_v4f16_f32:
1123 *this, E, Intrinsic::amdgcn_image_sample_1darray, false);
1124 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_f32_f32:
1125 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_v4f32_f32:
1126 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_v4f16_f32:
1128 *this, E, Intrinsic::amdgcn_image_sample_2d, false);
1129 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_f32_f32:
1130 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_v4f32_f32:
1131 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_v4f16_f32:
1133 *this, E, Intrinsic::amdgcn_image_sample_2darray, false);
1134 case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f32_f32:
1135 case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f16_f32:
1137 *this, E, Intrinsic::amdgcn_image_sample_3d, false);
1138 case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f32_f32:
1139 case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32:
1141 *this, E, Intrinsic::amdgcn_image_sample_cube, false);
1142 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f32_f32:
1143 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f16_f32:
1145 *this, E, Intrinsic::amdgcn_image_sample_lz_1d, false);
1146 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f32_f32:
1147 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f16_f32:
1149 *this, E, Intrinsic::amdgcn_image_sample_l_1d, false);
1150 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f32_f32:
1151 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f16_f32:
1153 *this, E, Intrinsic::amdgcn_image_sample_d_1d, false);
1154 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f32_f32:
1155 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f16_f32:
1156 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_f32_f32:
1158 *this, E, Intrinsic::amdgcn_image_sample_lz_2d, false);
1159 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f32_f32:
1160 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f16_f32:
1161 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_f32_f32:
1163 *this, E, Intrinsic::amdgcn_image_sample_l_2d, false);
1164 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f32_f32:
1165 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f16_f32:
1166 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_f32_f32:
1168 *this, E, Intrinsic::amdgcn_image_sample_d_2d, false);
1169 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f32_f32:
1170 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f16_f32:
1172 *this, E, Intrinsic::amdgcn_image_sample_lz_3d, false);
1173 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f32_f32:
1174 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f16_f32:
1176 *this, E, Intrinsic::amdgcn_image_sample_l_3d, false);
1177 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f32_f32:
1178 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f16_f32:
1180 *this, E, Intrinsic::amdgcn_image_sample_d_3d, false);
1181 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f32_f32:
1182 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f16_f32:
1184 *this, E, Intrinsic::amdgcn_image_sample_lz_cube, false);
1185 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f32_f32:
1186 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f16_f32:
1188 *this, E, Intrinsic::amdgcn_image_sample_l_cube, false);
1189 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32:
1190 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32:
1192 *this, E, Intrinsic::amdgcn_image_sample_lz_1darray, false);
1193 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f32_f32:
1194 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f16_f32:
1196 *this, E, Intrinsic::amdgcn_image_sample_l_1darray, false);
1197 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f32_f32:
1198 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f16_f32:
1200 *this, E, Intrinsic::amdgcn_image_sample_d_1darray, false);
1201 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32:
1202 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32:
1203 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_f32_f32:
1205 *this, E, Intrinsic::amdgcn_image_sample_lz_2darray, false);
1206 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f32_f32:
1207 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f16_f32:
1208 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_f32_f32:
1210 *this, E, Intrinsic::amdgcn_image_sample_l_2darray, false);
1211 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f32_f32:
1212 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f16_f32:
1213 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_f32_f32:
1215 *this, E, Intrinsic::amdgcn_image_sample_d_2darray, false);
1216 case clang::AMDGPU::BI__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32:
1218 *this, E, Intrinsic::amdgcn_image_gather4_lz_2d, false);
1219 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1220 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1221 llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
1222 Function *F = CGM.getIntrinsic(
1223 BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
1224 ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
1225 : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
1226 {VT, VT});
1227
1229 for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
1230 Args.push_back(EmitScalarExpr(E->getArg(I)));
1231 return Builder.CreateCall(F, Args);
1232 }
1233 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1234 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1235 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
1236 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
1237 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1238 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1239 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
1240 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
1241 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1242 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1243 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
1244 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
1245 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1246 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1247 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1248 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1249 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
1250 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
1251 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
1252 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
1253 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
1254 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
1255 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
1256 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
1257 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1258 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
1259 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1260 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
1261 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1262 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
1263 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1264 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
1265 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1266 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
1267 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1268 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
1269 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1270 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
1271 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1272 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
1273 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1274 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
1275 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1276 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
1277 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1278 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1279 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1280 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1281 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1282 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1283 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1284 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1285 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1286 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1287 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1288 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1289 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1290 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1291 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1292 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1293 // GFX1250 WMMA builtins
1294 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1295 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1296 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1297 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1298 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1299 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1300 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1301 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1302 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1303 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1304 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1305 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1306 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1307 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1308 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1309 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1310 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1311 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1312 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1313 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1314 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1315 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1316 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1317 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1318 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1319 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1320 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1321 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1322 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1323 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1324 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1325 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1326 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1327 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1328 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1329 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1330 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1331 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1332 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1333 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1334 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1335 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1336 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
1337
1338 // These operations perform a matrix multiplication and accumulation of
1339 // the form:
1340 // D = A * B + C
1341 // We need to specify one type for matrices AB and one for matrices CD.
1342 // Sparse matrix operations can have different types for A and B as well as
1343 // an additional type for sparsity index.
1344 // Destination type should be put before types used for source operands.
1345 SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
1346 // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
1347 // There is no need for the variable opsel argument, so always set it to
1348 // "false".
1349 bool AppendFalseForOpselArg = false;
1350 unsigned BuiltinWMMAOp;
1351 // Need return type when D and C are of different types.
1352 bool NeedReturnType = false;
1353
1354 switch (BuiltinID) {
1355 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
1356 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
1357 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
1358 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
1359 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1360 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
1361 break;
1362 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1363 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1364 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
1365 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
1366 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1367 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
1368 break;
1369 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
1370 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
1371 AppendFalseForOpselArg = true;
1372 [[fallthrough]];
1373 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1374 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
1375 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1376 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
1377 break;
1378 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
1379 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
1380 AppendFalseForOpselArg = true;
1381 [[fallthrough]];
1382 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1383 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
1384 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1385 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
1386 break;
1387 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1388 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
1389 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1390 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
1391 break;
1392 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1393 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
1394 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1395 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
1396 break;
1397 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1398 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1399 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1400 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
1401 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1402 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
1403 break;
1404 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1405 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1406 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1407 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
1408 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1409 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
1410 break;
1411 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1412 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
1413 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1414 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
1415 break;
1416 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1417 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
1418 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1419 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
1420 break;
1421 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1422 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
1423 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1424 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
1425 break;
1426 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1427 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
1428 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1429 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
1430 break;
1431 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1432 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
1433 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1434 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
1435 break;
1436 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1437 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
1438 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1439 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
1440 break;
1441 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1442 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
1443 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1444 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
1445 break;
1446 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1447 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
1448 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1449 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
1450 break;
1451 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1452 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1453 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1454 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
1455 break;
1456 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1457 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1458 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1459 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
1460 break;
1461 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1462 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1463 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1464 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
1465 break;
1466 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1467 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1468 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1469 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
1470 break;
1471 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1472 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1473 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1474 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
1475 break;
1476 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1477 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1478 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1479 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
1480 break;
1481 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1482 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1483 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1484 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
1485 break;
1486 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1487 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1488 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1489 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
1490 break;
1491 // GFX1250 WMMA builtins
1492 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1493 ArgsForMatchingMatrixTypes = {5, 1};
1494 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
1495 break;
1496 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1497 ArgsForMatchingMatrixTypes = {5, 1};
1498 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
1499 break;
1500 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1501 ArgsForMatchingMatrixTypes = {5, 1};
1502 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
1503 break;
1504 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1505 ArgsForMatchingMatrixTypes = {5, 1};
1506 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
1507 break;
1508 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1509 ArgsForMatchingMatrixTypes = {5, 1};
1510 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
1511 break;
1512 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1513 NeedReturnType = true;
1514 ArgsForMatchingMatrixTypes = {1, 5};
1515 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
1516 break;
1517 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1518 ArgsForMatchingMatrixTypes = {3, 0};
1519 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
1520 break;
1521 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1522 ArgsForMatchingMatrixTypes = {3, 0};
1523 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
1524 break;
1525 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1526 ArgsForMatchingMatrixTypes = {3, 0};
1527 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
1528 break;
1529 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1530 ArgsForMatchingMatrixTypes = {3, 0};
1531 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
1532 break;
1533 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1534 ArgsForMatchingMatrixTypes = {3, 0};
1535 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
1536 break;
1537 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1538 ArgsForMatchingMatrixTypes = {3, 0};
1539 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
1540 break;
1541 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1542 ArgsForMatchingMatrixTypes = {3, 0};
1543 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
1544 break;
1545 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1546 ArgsForMatchingMatrixTypes = {3, 0};
1547 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
1548 break;
1549 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1550 ArgsForMatchingMatrixTypes = {3, 0};
1551 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
1552 break;
1553 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1554 ArgsForMatchingMatrixTypes = {3, 0};
1555 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
1556 break;
1557 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1558 ArgsForMatchingMatrixTypes = {3, 0};
1559 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
1560 break;
1561 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1562 ArgsForMatchingMatrixTypes = {3, 0};
1563 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
1564 break;
1565 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1566 ArgsForMatchingMatrixTypes = {3, 0};
1567 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
1568 break;
1569 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1570 ArgsForMatchingMatrixTypes = {3, 0};
1571 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
1572 break;
1573 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1574 ArgsForMatchingMatrixTypes = {3, 0};
1575 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
1576 break;
1577 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1578 ArgsForMatchingMatrixTypes = {3, 0};
1579 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
1580 break;
1581 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1582 ArgsForMatchingMatrixTypes = {4, 1};
1583 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
1584 break;
1585 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1586 ArgsForMatchingMatrixTypes = {5, 1, 3};
1587 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
1588 break;
1589 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1590 ArgsForMatchingMatrixTypes = {5, 1, 3};
1591 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4;
1592 break;
1593 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1594 ArgsForMatchingMatrixTypes = {5, 1, 3};
1595 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4;
1596 break;
1597 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1598 ArgsForMatchingMatrixTypes = {3, 0, 1};
1599 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
1600 break;
1601 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1602 ArgsForMatchingMatrixTypes = {3, 0, 1};
1603 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
1604 break;
1605 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1606 ArgsForMatchingMatrixTypes = {3, 0, 1};
1607 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
1608 break;
1609 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1610 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1611 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
1612 break;
1613 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1614 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1615 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
1616 break;
1617 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1618 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1619 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
1620 break;
1621 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1622 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1623 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
1624 break;
1625 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1626 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1627 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
1628 break;
1629 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1630 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1631 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
1632 break;
1633 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1634 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1635 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
1636 break;
1637 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1638 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1639 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
1640 break;
1641 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1642 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1643 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
1644 break;
1645 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1646 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1647 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
1648 break;
1649 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1650 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1651 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
1652 break;
1653 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1654 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1655 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
1656 break;
1657 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1658 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1659 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
1660 break;
1661 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
1662 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1663 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
1664 break;
1665 }
1666
1668 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
1669 Args.push_back(EmitScalarExpr(E->getArg(i)));
1670 if (AppendFalseForOpselArg)
1671 Args.push_back(Builder.getFalse());
1672
1674 if (NeedReturnType)
1675 ArgTypes.push_back(ConvertType(E->getType()));
1676 for (auto ArgIdx : ArgsForMatchingMatrixTypes)
1677 ArgTypes.push_back(Args[ArgIdx]->getType());
1678
1679 Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
1680 return Builder.CreateCall(F, Args);
1681 }
1682 // amdgcn workgroup size
1683 case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
1684 return EmitAMDGPUWorkGroupSize(*this, 0);
1685 case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
1686 return EmitAMDGPUWorkGroupSize(*this, 1);
1687 case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
1688 return EmitAMDGPUWorkGroupSize(*this, 2);
1689
1690 // amdgcn grid size
1691 case AMDGPU::BI__builtin_amdgcn_grid_size_x:
1692 return EmitAMDGPUGridSize(*this, 0);
1693 case AMDGPU::BI__builtin_amdgcn_grid_size_y:
1694 return EmitAMDGPUGridSize(*this, 1);
1695 case AMDGPU::BI__builtin_amdgcn_grid_size_z:
1696 return EmitAMDGPUGridSize(*this, 2);
1697
1698 // r600 intrinsics
1699 case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
1700 case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
1702 Intrinsic::r600_recipsqrt_ieee);
1703 case AMDGPU::BI__builtin_amdgcn_alignbit: {
1704 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
1705 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
1706 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
1707 Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
1708 return Builder.CreateCall(F, { Src0, Src1, Src2 });
1709 }
1710 case AMDGPU::BI__builtin_amdgcn_fence: {
1712 EmitScalarExpr(E->getArg(1)), AO, SSID);
1713 FenceInst *Fence = Builder.CreateFence(AO, SSID);
1714 if (E->getNumArgs() > 2)
1716 return Fence;
1717 }
1718 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1719 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1720 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1721 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1722 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1723 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1724 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1725 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1726 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1727 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1728 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1729 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1730 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1731 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1732 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1733 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1734 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1735 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1736 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1737 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1738 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1739 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1740 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1741 llvm::AtomicRMWInst::BinOp BinOp;
1742 switch (BuiltinID) {
1743 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1744 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1745 BinOp = llvm::AtomicRMWInst::UIncWrap;
1746 break;
1747 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1748 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1749 BinOp = llvm::AtomicRMWInst::UDecWrap;
1750 break;
1751 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1752 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1753 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1754 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1755 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1756 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1757 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1758 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1759 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1760 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1761 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1762 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1763 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1764 BinOp = llvm::AtomicRMWInst::FAdd;
1765 break;
1766 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1767 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1768 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1769 BinOp = llvm::AtomicRMWInst::FMin;
1770 break;
1771 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1772 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1773 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1774 BinOp = llvm::AtomicRMWInst::FMax;
1775 break;
1776 }
1777
1778 Address Ptr = CheckAtomicAlignment(*this, E);
1779 Value *Val = EmitScalarExpr(E->getArg(1));
1780 llvm::Type *OrigTy = Val->getType();
1781 QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
1782
1783 bool Volatile;
1784
1785 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1786 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1787 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
1788 // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
1789 Volatile =
1790 cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
1791 } else {
1792 // Infer volatile from the passed type.
1793 Volatile =
1794 PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1795 }
1796
1797 if (E->getNumArgs() >= 4) {
1798 // Some of the builtins have explicit ordering and scope arguments.
1800 EmitScalarExpr(E->getArg(3)), AO, SSID);
1801 } else {
1802 // Most of the builtins do not have syncscope/order arguments. For DS
1803 // atomics the scope doesn't really matter, as they implicitly operate at
1804 // workgroup scope.
1805 //
1806 // The global/flat cases need to use agent scope to consistently produce
1807 // the native instruction instead of a cmpxchg expansion.
1808 if (getTarget().getTriple().isSPIRV())
1809 SSID = getLLVMContext().getOrInsertSyncScopeID("device");
1810 else
1811 SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1812 AO = AtomicOrdering::Monotonic;
1813
1814 // The v2bf16 builtin uses i16 instead of a natural bfloat type.
1815 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
1816 BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
1817 BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1818 llvm::Type *V2BF16Ty = FixedVectorType::get(
1819 llvm::Type::getBFloatTy(Builder.getContext()), 2);
1820 Val = Builder.CreateBitCast(Val, V2BF16Ty);
1821 }
1822 }
1823
1824 llvm::AtomicRMWInst *RMW =
1825 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1826 if (Volatile)
1827 RMW->setVolatile(true);
1828
1829 unsigned AddrSpace = Ptr.getType()->getAddressSpace();
1830 if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
1831 // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
1832 // instruction for flat and global operations.
1833 llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
1834 RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
1835
1836 // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
1837 // instruction, but this only matters for float fadd.
1838 if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
1839 RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
1840 }
1841
1842 return Builder.CreateBitCast(RMW, OrigTy);
1843 }
1844 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1845 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
1846 llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
1847 llvm::Type *ResultType = ConvertType(E->getType());
1848 // s_sendmsg_rtn is mangled using return type only.
1849 Function *F =
1850 CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
1851 return Builder.CreateCall(F, {Arg});
1852 }
1853 case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
1854 case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
1855 // Because builtin types are limited, and the intrinsic uses a struct/pair
1856 // output, marshal the pair-of-i32 to <2 x i32>.
1857 Value *VDstOld = EmitScalarExpr(E->getArg(0));
1858 Value *VSrcOld = EmitScalarExpr(E->getArg(1));
1859 Value *FI = EmitScalarExpr(E->getArg(2));
1860 Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
1861 Function *F =
1862 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
1863 ? Intrinsic::amdgcn_permlane16_swap
1864 : Intrinsic::amdgcn_permlane32_swap);
1865 llvm::CallInst *Call =
1866 Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});
1867
1868 llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
1869 llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);
1870
1871 llvm::Type *ResultType = ConvertType(E->getType());
1872
1873 llvm::Value *Insert0 = Builder.CreateInsertElement(
1874 llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
1875 llvm::Value *AsVector =
1876 Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
1877 return AsVector;
1878 }
1879 case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
1880 case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
1882 Intrinsic::amdgcn_bitop3);
1883 case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
1884 // TODO: LLVM has this overloaded to allow for fat pointers, but since
1885 // those haven't been plumbed through to Clang yet, default to creating the
1886 // resource type.
1888 for (unsigned I = 0; I < 4; ++I)
1889 Args.push_back(EmitScalarExpr(E->getArg(I)));
1890 llvm::PointerType *RetTy = llvm::PointerType::get(
1891 Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
1892 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
1893 {RetTy, Args[0]->getType()});
1894 return Builder.CreateCall(F, Args);
1895 }
1896 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
1897 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
1898 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
1899 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
1900 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
1901 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
1903 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
1904 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1905 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1906 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1907 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1908 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1909 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
1910 llvm::Type *RetTy = nullptr;
1911 switch (BuiltinID) {
1912 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1913 RetTy = Int8Ty;
1914 break;
1915 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1916 RetTy = Int16Ty;
1917 break;
1918 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1919 RetTy = Int32Ty;
1920 break;
1921 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1922 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
1923 break;
1924 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1925 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
1926 break;
1927 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
1928 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
1929 break;
1930 }
1931 Function *F =
1932 CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
1933 return Builder.CreateCall(
1934 F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
1935 EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
1936 }
1937 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32:
1939 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_add);
1940 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32:
1941 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16:
1943 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd);
1944 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32:
1945 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f64:
1947 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin);
1948 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32:
1949 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64:
1951 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax);
1952 case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
1954 *this, E, Intrinsic::amdgcn_s_prefetch_data);
1955 case Builtin::BIlogbf:
1956 case Builtin::BI__builtin_logbf: {
1957 Value *Src0 = EmitScalarExpr(E->getArg(0));
1958 Function *FrExpFunc = CGM.getIntrinsic(
1959 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1960 CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1961 Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1962 Value *Add = Builder.CreateAdd(
1963 Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1964 Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
1965 Value *Fabs =
1966 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1967 Value *FCmpONE = Builder.CreateFCmpONE(
1968 Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
1969 Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1970 Value *FCmpOEQ =
1971 Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy()));
1972 Value *Sel2 = Builder.CreateSelect(
1973 FCmpOEQ,
1974 ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1);
1975 return Sel2;
1976 }
1977 case Builtin::BIlogb:
1978 case Builtin::BI__builtin_logb: {
1979 Value *Src0 = EmitScalarExpr(E->getArg(0));
1980 Function *FrExpFunc = CGM.getIntrinsic(
1981 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1982 CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1983 Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1984 Value *Add = Builder.CreateAdd(
1985 Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1986 Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
1987 Value *Fabs =
1988 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1989 Value *FCmpONE = Builder.CreateFCmpONE(
1990 Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
1991 Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1992 Value *FCmpOEQ =
1993 Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy()));
1994 Value *Sel2 = Builder.CreateSelect(
1995 FCmpOEQ,
1996 ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
1997 Sel1);
1998 return Sel2;
1999 }
2000 case Builtin::BIscalbnf:
2001 case Builtin::BI__builtin_scalbnf:
2002 case Builtin::BIscalbn:
2003 case Builtin::BI__builtin_scalbn:
2005 *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
2006 default:
2007 return nullptr;
2008 }
2009}
#define V(N, I)
static Value * emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E, Intrinsic::ID IntrinsicID, Intrinsic::ID ConstrainedIntrinsicID)
Address CheckAtomicAlignment(CodeGenFunction &CGF, const CallExpr *E)
llvm::Value * emitBuiltinWithOneOverloadedType(clang::CodeGen::CodeGenFunction &CGF, const clang::CallExpr *E, unsigned IntrinsicID, llvm::StringRef Name="")
Definition CGBuiltin.h:63
static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID)
Definition AMDGPU.cpp:386
static StringRef mapScopeToSPIRV(StringRef AMDGCNScope)
Definition AMDGPU.cpp:267
static llvm::Value * loadTextureDescPtorAsVec8I32(CodeGenFunction &CGF, llvm::Value *RsrcPtr)
Definition AMDGPU.cpp:188
static Value * EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E, llvm::Type *RegisterType, llvm::Type *ValueType, bool isExecHi)
Definition AMDGPU.cpp:170
llvm::CallInst * emitAMDGCNImageOverloadedReturnType(clang::CodeGen::CodeGenFunction &CGF, const clang::CallExpr *E, unsigned IntrinsicID, bool IsImageStore)
Definition AMDGPU.cpp:218
static Value * emitFPIntBuiltin(CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID)
Definition AMDGPU.cpp:257
TokenType getType() const
Returns the token's type, e.g.
#define X(type, name)
Definition Value.h:97
HLSLResourceBindingAttr::RegisterType RegisterType
Definition SemaHLSL.cpp:58
static QualType getPointeeType(const MemRegion *R)
Provides definitions for the atomic synchronization scopes.
Enumerates target-specific builtins in their own namespaces within namespace clang.
QualType GetBuiltinType(unsigned ID, GetBuiltinTypeError &Error, unsigned *IntegerConstantArgs=nullptr) const
Return the type for the specified builtin.
unsigned getTargetAddressSpace(LangAS AS) const
@ GE_None
No error.
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition Expr.h:2877
Expr * getArg(unsigned Arg)
getArg - Return the specified argument.
Definition Expr.h:3081
unsigned getNumArgs() const
getNumArgs - Return the number of actual arguments to this call.
Definition Expr.h:3068
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition CharUnits.h:63
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition Address.h:128
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition Address.h:209
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition Address.h:204
Address CreateGEP(CodeGenFunction &CGF, Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition CGBuilder.h:296
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
Definition CGBuilder.h:112
llvm::LoadInst * CreateAlignedLoad(llvm::Type *Ty, llvm::Value *Addr, CharUnits Align, const llvm::Twine &Name="")
Definition CGBuilder.h:132
Address CreateAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition CGBuilder.h:193
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Value * EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx, const CallExpr *E)
Definition AMDGPU.cpp:346
llvm::Type * ConvertType(QualType T)
llvm::Value * EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E)
Definition AMDGPU.cpp:420
const TargetInfo & getTarget() const
void AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, const CallExpr *E)
Definition AMDGPU.cpp:363
Address EmitPointerWithAlignment(const Expr *Addr, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr, KnownNonNull_t IsKnownNonNull=NotKnownNonNull)
EmitPointerWithAlignment - Given an expression with a pointer type, emit the value and compute our be...
Definition CGExpr.cpp:1552
llvm::Value * EmitScalarExpr(const Expr *E, bool IgnoreResultAssign=false)
EmitScalarExpr - Emit the computation of the specified expression of LLVM scalar type,...
llvm::LLVMContext & getLLVMContext()
void ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope, llvm::AtomicOrdering &AO, llvm::SyncScope::ID &SSID)
Definition AMDGPU.cpp:281
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
const llvm::DataLayout & getDataLayout() const
ASTContext & getContext() const
llvm::Function * getIntrinsic(unsigned IID, ArrayRef< llvm::Type * > Tys={})
Expr * IgnoreParenCasts() LLVM_READONLY
Skip past any parentheses and casts which might surround this expression until reaching a fixed point...
Definition Expr.cpp:3090
std::optional< llvm::APSInt > getIntegerConstantExpr(const ASTContext &Ctx) const
isIntegerConstantExpr - Return the value if this expression is a valid integer constant expression.
Expr * IgnoreImpCasts() LLVM_READONLY
Skip past any implicit casts which might surround this expression until reaching a fixed point.
Definition Expr.cpp:3065
SourceLocation getExprLoc() const LLVM_READONLY
getExprLoc - Return the preferred location for the arrow when diagnosing a problem with a generic exp...
Definition Expr.cpp:273
QualType getType() const
Definition Expr.h:144
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition TypeBase.h:3328
A (possibly-)qualified type.
Definition TypeBase.h:937
bool isVolatileQualified() const
Determine whether this type is volatile-qualified.
Definition TypeBase.h:8362
QualType getCanonicalType() const
Definition TypeBase.h:8330
Scope - A scope is a transient data structure that is used while parsing the program.
Definition Scope.h:41
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition TargetInfo.h:324
const llvm::Triple & getTriple() const
Returns the target triple of the primary target.
unsigned getMaxOpenCLWorkGroupSize() const
Definition TargetInfo.h:873
llvm::CodeObjectVersionKind CodeObjectVersion
Code object version for AMDGPU.
QualType getType() const
Definition Value.cpp:237
The JSON file list parser is used to communicate input to InstallAPI.
@ Result
The result type of a method or function.
Definition TypeBase.h:905
U cast(CodeGen::Address addr)
Definition Address.h:327
Diagnostic wrappers for TextAPI types for error reporting.
Definition Dominators.h:30
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64