clang 22.0.0git
AMDGPU.cpp
Go to the documentation of this file.
1//===------- AMDCPU.cpp - Emit LLVM Code for builtins ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This contains code to emit Builtin calls as LLVM code.
10//
11//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "CodeGenFunction.h"
17#include "llvm/Analysis/ValueTracking.h"
18#include "llvm/CodeGen/MachineFunction.h"
19#include "llvm/IR/IntrinsicsAMDGPU.h"
20#include "llvm/IR/IntrinsicsR600.h"
21#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
22#include "llvm/Support/AMDGPUAddrSpace.h"
23
24using namespace clang;
25using namespace CodeGen;
26using namespace llvm;
27
28namespace {
29
30// Has second type mangled argument.
31static Value *
33 Intrinsic::ID IntrinsicID,
34 Intrinsic::ID ConstrainedIntrinsicID) {
35 llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
36 llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
37
38 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
39 if (CGF.Builder.getIsFPConstrained()) {
40 Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
41 {Src0->getType(), Src1->getType()});
42 return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
43 }
44
45 Function *F =
46 CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
47 return CGF.Builder.CreateCall(F, {Src0, Src1});
48}
49
50// If \p E is not null pointer, insert address space cast to match return
51// type of \p E if necessary.
52Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
53 const CallExpr *E = nullptr) {
54 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
55 auto *Call = CGF.Builder.CreateCall(F);
56 Call->addRetAttr(
57 Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
58 Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
59 if (!E)
60 return Call;
61 QualType BuiltinRetType = E->getType();
62 auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
63 if (RetTy == Call->getType())
64 return Call;
65 return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
66}
67
68Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
69 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
70 auto *Call = CGF.Builder.CreateCall(F);
71 Call->addRetAttr(
72 Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
73 Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
74 return Call;
75}
76
77// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
78/// Emit code based on Code Object ABI version.
79/// COV_4 : Emit code to use dispatch ptr
80/// COV_5+ : Emit code to use implicitarg ptr
81/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
82/// and use its value for COV_4 or COV_5+ approach. It is used for
83/// compiling device libraries in an ABI-agnostic way.
84Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
85 llvm::LoadInst *LD;
86
87 auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
88
89 if (Cov == CodeObjectVersionKind::COV_None) {
90 StringRef Name = "__oclc_ABI_version";
91 auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
92 if (!ABIVersionC)
93 ABIVersionC = new llvm::GlobalVariable(
94 CGF.CGM.getModule(), CGF.Int32Ty, false,
95 llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
96 llvm::GlobalVariable::NotThreadLocal,
98
99 // This load will be eliminated by the IPSCCP because it is constant
100 // weak_odr without externally_initialized. Either changing it to weak or
101 // adding externally_initialized will keep the load.
102 Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
103 CGF.CGM.getIntAlign());
104
105 Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
106 ABIVersion,
107 llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));
108
109 // Indexing the implicit kernarg segment.
110 Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
111 CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
112
113 // Indexing the HSA kernel_dispatch_packet struct.
114 Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
115 CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
116
117 auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
118 LD = CGF.Builder.CreateLoad(
119 Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
120 } else {
121 Value *GEP = nullptr;
122 if (Cov >= CodeObjectVersionKind::COV_5) {
123 // Indexing the implicit kernarg segment.
124 GEP = CGF.Builder.CreateConstGEP1_32(
125 CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
126 } else {
127 // Indexing the HSA kernel_dispatch_packet struct.
128 GEP = CGF.Builder.CreateConstGEP1_32(
129 CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
130 }
131 LD = CGF.Builder.CreateLoad(
133 }
134
135 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
136 llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
137 APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
138 LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
139 LD->setMetadata(llvm::LLVMContext::MD_noundef,
140 llvm::MDNode::get(CGF.getLLVMContext(), {}));
141 LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
142 llvm::MDNode::get(CGF.getLLVMContext(), {}));
143 return LD;
144}
145
146// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
147Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
148 const unsigned XOffset = 12;
149 auto *DP = EmitAMDGPUDispatchPtr(CGF);
150 // Indexing the HSA kernel_dispatch_packet struct.
151 auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
152 auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
153 auto *LD = CGF.Builder.CreateLoad(
155
156 llvm::MDBuilder MDB(CGF.getLLVMContext());
157
158 // Known non-zero.
159 LD->setMetadata(llvm::LLVMContext::MD_range,
160 MDB.createRange(APInt(32, 1), APInt::getZero(32)));
161 LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
162 llvm::MDNode::get(CGF.getLLVMContext(), {}));
163 return LD;
164}
165} // namespace
166
167// Generates the IR for __builtin_read_exec_*.
168// Lowers the builtin to amdgcn_ballot intrinsic.
170 llvm::Type *RegisterType,
171 llvm::Type *ValueType, bool isExecHi) {
172 CodeGen::CGBuilderTy &Builder = CGF.Builder;
173 CodeGen::CodeGenModule &CGM = CGF.CGM;
174
175 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
176 llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});
177
178 if (isExecHi) {
179 Value *Rt2 = Builder.CreateLShr(Call, 32);
180 Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
181 return Rt2;
182 }
183
184 return Call;
185}
186
188 llvm::Value *RsrcPtr) {
189 auto &B = CGF.Builder;
190 auto *VecTy = llvm::FixedVectorType::get(B.getInt32Ty(), 8);
191
192 if (RsrcPtr->getType() == VecTy)
193 return RsrcPtr;
194
195 if (RsrcPtr->getType()->isIntegerTy(32)) {
196 llvm::PointerType *VecPtrTy =
197 llvm::PointerType::get(CGF.getLLVMContext(), 8);
198 llvm::Value *Ptr = B.CreateIntToPtr(RsrcPtr, VecPtrTy, "tex.rsrc.from.int");
199 return B.CreateAlignedLoad(VecTy, Ptr, llvm::Align(32), "tex.rsrc.val");
200 }
201
202 if (RsrcPtr->getType()->isPointerTy()) {
203 auto *VecPtrTy = llvm::PointerType::get(
204 CGF.getLLVMContext(), RsrcPtr->getType()->getPointerAddressSpace());
205 llvm::Value *Typed = B.CreateBitCast(RsrcPtr, VecPtrTy, "tex.rsrc.typed");
206 return B.CreateAlignedLoad(VecTy, Typed, llvm::Align(32), "tex.rsrc.val");
207 }
208
209 const auto &DL = CGF.CGM.getDataLayout();
210 if (DL.getTypeSizeInBits(RsrcPtr->getType()) == 256)
211 return B.CreateBitCast(RsrcPtr, VecTy, "tex.rsrc.val");
212
213 llvm::report_fatal_error("Unexpected texture resource argument form");
214}
215
216llvm::CallInst *
218 const clang::CallExpr *E,
219 unsigned IntrinsicID, bool IsImageStore) {
220 auto findTextureDescIndex = [&CGF](const CallExpr *E) -> unsigned {
221 QualType TexQT = CGF.getContext().AMDGPUTextureTy;
222 for (unsigned I = 0, N = E->getNumArgs(); I < N; ++I) {
223 QualType ArgTy = E->getArg(I)->getType();
224 if (ArgTy == TexQT) {
225 return I;
226 }
227
228 if (ArgTy.getCanonicalType() == TexQT.getCanonicalType()) {
229 return I;
230 }
231 }
232
233 return ~0U;
234 };
235
237 unsigned RsrcIndex = findTextureDescIndex(E);
238
239 if (RsrcIndex == ~0U) {
240 llvm::report_fatal_error("Invalid argument count for image builtin");
241 }
242
243 for (unsigned I = 0; I < E->getNumArgs(); ++I) {
244 llvm::Value *V = CGF.EmitScalarExpr(E->getArg(I));
245 if (I == RsrcIndex)
247 Args.push_back(V);
248 }
249
250 llvm::Type *RetTy = IsImageStore ? CGF.VoidTy : CGF.ConvertType(E->getType());
251 llvm::CallInst *Call = CGF.Builder.CreateIntrinsic(RetTy, IntrinsicID, Args);
252 return Call;
253}
254
255// Emit an intrinsic that has 1 float or double operand, and 1 integer.
257 const CallExpr *E,
258 unsigned IntrinsicID) {
259 llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
260 llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
261
262 Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
263 return CGF.Builder.CreateCall(F, {Src0, Src1});
264}
265
266static inline StringRef mapScopeToSPIRV(StringRef AMDGCNScope) {
267 if (AMDGCNScope == "agent")
268 return "device";
269 if (AMDGCNScope == "wavefront")
270 return "subgroup";
271 return AMDGCNScope;
272}
273
274// For processing memory ordering and memory scope arguments of various
275// amdgcn builtins.
276// \p Order takes a C++11 compatible memory-ordering specifier and converts
277// it into LLVM's memory ordering specifier using atomic C ABI, and writes
278// to \p AO. \p Scope takes a const char * and converts it into AMDGCN
279// specific SyncScopeID and writes it to \p SSID.
281 llvm::AtomicOrdering &AO,
282 llvm::SyncScope::ID &SSID) {
283 int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
284
285 // Map C11/C++11 memory ordering to LLVM memory ordering
286 assert(llvm::isValidAtomicOrderingCABI(ord));
287 switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
288 case llvm::AtomicOrderingCABI::acquire:
289 case llvm::AtomicOrderingCABI::consume:
290 AO = llvm::AtomicOrdering::Acquire;
291 break;
292 case llvm::AtomicOrderingCABI::release:
293 AO = llvm::AtomicOrdering::Release;
294 break;
295 case llvm::AtomicOrderingCABI::acq_rel:
296 AO = llvm::AtomicOrdering::AcquireRelease;
297 break;
298 case llvm::AtomicOrderingCABI::seq_cst:
299 AO = llvm::AtomicOrdering::SequentiallyConsistent;
300 break;
301 case llvm::AtomicOrderingCABI::relaxed:
302 AO = llvm::AtomicOrdering::Monotonic;
303 break;
304 }
305
306 // Some of the atomic builtins take the scope as a string name.
307 StringRef scp;
308 if (llvm::getConstantStringInfo(Scope, scp)) {
309 if (getTarget().getTriple().isSPIRV())
310 scp = mapScopeToSPIRV(scp);
311 SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
312 return;
313 }
314
315 // Older builtins had an enum argument for the memory scope.
316 int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
317 switch (scope) {
318 case 0: // __MEMORY_SCOPE_SYSTEM
319 SSID = llvm::SyncScope::System;
320 break;
321 case 1: // __MEMORY_SCOPE_DEVICE
322 if (getTarget().getTriple().isSPIRV())
323 SSID = getLLVMContext().getOrInsertSyncScopeID("device");
324 else
325 SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
326 break;
327 case 2: // __MEMORY_SCOPE_WRKGRP
328 SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
329 break;
330 case 3: // __MEMORY_SCOPE_WVFRNT
331 if (getTarget().getTriple().isSPIRV())
332 SSID = getLLVMContext().getOrInsertSyncScopeID("subgroup");
333 else
334 SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
335 break;
336 case 4: // __MEMORY_SCOPE_SINGLE
337 SSID = llvm::SyncScope::SingleThread;
338 break;
339 default:
340 SSID = llvm::SyncScope::System;
341 break;
342 }
343}
344
345llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
346 unsigned Idx,
347 const CallExpr *E) {
348 llvm::Value *Arg = nullptr;
349 if ((ICEArguments & (1 << Idx)) == 0) {
350 Arg = EmitScalarExpr(E->getArg(Idx));
351 } else {
352 // If this is required to be a constant, constant fold it so that we
353 // know that the generated intrinsic gets a ConstantInt.
354 std::optional<llvm::APSInt> Result =
356 assert(Result && "Expected argument to be a constant");
357 Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
358 }
359 return Arg;
360}
361
363 const CallExpr *E) {
364 constexpr const char *Tag = "amdgpu-synchronize-as";
365
366 LLVMContext &Ctx = Inst->getContext();
368 for (unsigned K = 2; K < E->getNumArgs(); ++K) {
369 llvm::Value *V = EmitScalarExpr(E->getArg(K));
370 StringRef AS;
371 if (llvm::getConstantStringInfo(V, AS)) {
372 MMRAs.push_back({Tag, AS});
373 // TODO: Delete the resulting unused constant?
374 continue;
375 }
376 CGM.Error(E->getExprLoc(),
377 "expected an address space name as a string literal");
378 }
379
380 llvm::sort(MMRAs);
381 MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
382 Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
383}
384
385static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
386 switch (BuiltinID) {
387 default:
388 llvm_unreachable("Unknown BuiltinID for wave reduction");
389 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
390 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
391 return Intrinsic::amdgcn_wave_reduce_add;
392 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
393 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
394 return Intrinsic::amdgcn_wave_reduce_sub;
395 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
396 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
397 return Intrinsic::amdgcn_wave_reduce_min;
398 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
399 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
400 return Intrinsic::amdgcn_wave_reduce_umin;
401 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
402 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
403 return Intrinsic::amdgcn_wave_reduce_max;
404 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
405 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
406 return Intrinsic::amdgcn_wave_reduce_umax;
407 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
408 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
409 return Intrinsic::amdgcn_wave_reduce_and;
410 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
411 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
412 return Intrinsic::amdgcn_wave_reduce_or;
413 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
414 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
415 return Intrinsic::amdgcn_wave_reduce_xor;
416 }
417}
418
420 const CallExpr *E) {
421 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
422 llvm::SyncScope::ID SSID;
423 switch (BuiltinID) {
424 case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
425 case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
426 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
427 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
428 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
429 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
430 case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
431 case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
432 case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
433 case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
434 case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
435 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
436 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
437 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
438 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
439 case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
440 case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
441 case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: {
442 Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
443 llvm::Value *Value = EmitScalarExpr(E->getArg(0));
444 llvm::Value *Strategy = EmitScalarExpr(E->getArg(1));
445 llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
446 return Builder.CreateCall(F, {Value, Strategy});
447 }
448 case AMDGPU::BI__builtin_amdgcn_div_scale:
449 case AMDGPU::BI__builtin_amdgcn_div_scalef: {
450 // Translate from the intrinsics's struct return to the builtin's out
451 // argument.
452
453 Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
454
455 llvm::Value *X = EmitScalarExpr(E->getArg(0));
456 llvm::Value *Y = EmitScalarExpr(E->getArg(1));
457 llvm::Value *Z = EmitScalarExpr(E->getArg(2));
458
459 llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
460 X->getType());
461
462 llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
463
464 llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
465 llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
466
467 llvm::Type *RealFlagType = FlagOutPtr.getElementType();
468
469 llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
470 Builder.CreateStore(FlagExt, FlagOutPtr);
471 return Result;
472 }
473 case AMDGPU::BI__builtin_amdgcn_div_fmas:
474 case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
475 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
476 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
477 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
478 llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
479
480 llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
481 Src0->getType());
482 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
483 return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
484 }
485
486 case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
488 Intrinsic::amdgcn_ds_swizzle);
489 case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
490 case AMDGPU::BI__builtin_amdgcn_mov_dpp:
491 case AMDGPU::BI__builtin_amdgcn_update_dpp: {
493 // Find out if any arguments are required to be integer constant
494 // expressions.
495 unsigned ICEArguments = 0;
497 getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
498 assert(Error == ASTContext::GE_None && "Should not codegen an error");
499 llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
500 unsigned Size = DataTy->getPrimitiveSizeInBits();
501 llvm::Type *IntTy =
502 llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
503 Function *F =
504 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
505 ? Intrinsic::amdgcn_mov_dpp8
506 : Intrinsic::amdgcn_update_dpp,
507 IntTy);
508 assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
509 E->getNumArgs() == 2);
510 bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
511 if (InsertOld)
512 Args.push_back(llvm::PoisonValue::get(IntTy));
513 for (unsigned I = 0; I != E->getNumArgs(); ++I) {
514 llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
515 if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
516 Size < 32) {
517 if (!DataTy->isIntegerTy())
518 V = Builder.CreateBitCast(
519 V, llvm::IntegerType::get(Builder.getContext(), Size));
520 V = Builder.CreateZExtOrBitCast(V, IntTy);
521 }
522 llvm::Type *ExpTy =
523 F->getFunctionType()->getFunctionParamType(I + InsertOld);
524 Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
525 }
526 Value *V = Builder.CreateCall(F, Args);
527 if (Size < 32 && !DataTy->isIntegerTy())
528 V = Builder.CreateTrunc(
529 V, llvm::IntegerType::get(Builder.getContext(), Size));
530 return Builder.CreateTruncOrBitCast(V, DataTy);
531 }
532 case AMDGPU::BI__builtin_amdgcn_permlane16:
533 case AMDGPU::BI__builtin_amdgcn_permlanex16:
535 *this, E,
536 BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
537 ? Intrinsic::amdgcn_permlane16
538 : Intrinsic::amdgcn_permlanex16);
539 case AMDGPU::BI__builtin_amdgcn_permlane64:
541 Intrinsic::amdgcn_permlane64);
542 case AMDGPU::BI__builtin_amdgcn_readlane:
544 Intrinsic::amdgcn_readlane);
545 case AMDGPU::BI__builtin_amdgcn_readfirstlane:
547 Intrinsic::amdgcn_readfirstlane);
548 case AMDGPU::BI__builtin_amdgcn_div_fixup:
549 case AMDGPU::BI__builtin_amdgcn_div_fixupf:
550 case AMDGPU::BI__builtin_amdgcn_div_fixuph:
552 Intrinsic::amdgcn_div_fixup);
553 case AMDGPU::BI__builtin_amdgcn_trig_preop:
554 case AMDGPU::BI__builtin_amdgcn_trig_preopf:
555 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
556 case AMDGPU::BI__builtin_amdgcn_rcp:
557 case AMDGPU::BI__builtin_amdgcn_rcpf:
558 case AMDGPU::BI__builtin_amdgcn_rcph:
559 case AMDGPU::BI__builtin_amdgcn_rcp_bf16:
560 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
561 case AMDGPU::BI__builtin_amdgcn_sqrt:
562 case AMDGPU::BI__builtin_amdgcn_sqrtf:
563 case AMDGPU::BI__builtin_amdgcn_sqrth:
564 case AMDGPU::BI__builtin_amdgcn_sqrt_bf16:
566 Intrinsic::amdgcn_sqrt);
567 case AMDGPU::BI__builtin_amdgcn_rsq:
568 case AMDGPU::BI__builtin_amdgcn_rsqf:
569 case AMDGPU::BI__builtin_amdgcn_rsqh:
570 case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
571 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
572 case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
573 case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
575 Intrinsic::amdgcn_rsq_clamp);
576 case AMDGPU::BI__builtin_amdgcn_sinf:
577 case AMDGPU::BI__builtin_amdgcn_sinh:
578 case AMDGPU::BI__builtin_amdgcn_sin_bf16:
579 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
580 case AMDGPU::BI__builtin_amdgcn_cosf:
581 case AMDGPU::BI__builtin_amdgcn_cosh:
582 case AMDGPU::BI__builtin_amdgcn_cos_bf16:
583 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
584 case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
585 return EmitAMDGPUDispatchPtr(*this, E);
586 case AMDGPU::BI__builtin_amdgcn_logf:
587 case AMDGPU::BI__builtin_amdgcn_log_bf16:
588 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
589 case AMDGPU::BI__builtin_amdgcn_exp2f:
590 case AMDGPU::BI__builtin_amdgcn_exp2_bf16:
592 Intrinsic::amdgcn_exp2);
593 case AMDGPU::BI__builtin_amdgcn_log_clampf:
595 Intrinsic::amdgcn_log_clamp);
596 case AMDGPU::BI__builtin_amdgcn_ldexp:
597 case AMDGPU::BI__builtin_amdgcn_ldexpf: {
598 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
599 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
600 llvm::Function *F =
601 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
602 return Builder.CreateCall(F, {Src0, Src1});
603 }
604 case AMDGPU::BI__builtin_amdgcn_ldexph: {
605 // The raw instruction has a different behavior for out of bounds exponent
606 // values (implicit truncation instead of saturate to short_min/short_max).
607 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
608 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
609 llvm::Function *F =
610 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
611 return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
612 }
613 case AMDGPU::BI__builtin_amdgcn_frexp_mant:
614 case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
615 case AMDGPU::BI__builtin_amdgcn_frexp_manth:
617 Intrinsic::amdgcn_frexp_mant);
618 case AMDGPU::BI__builtin_amdgcn_frexp_exp:
619 case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
620 Value *Src0 = EmitScalarExpr(E->getArg(0));
621 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
622 { Builder.getInt32Ty(), Src0->getType() });
623 return Builder.CreateCall(F, Src0);
624 }
625 case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
626 Value *Src0 = EmitScalarExpr(E->getArg(0));
627 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
628 { Builder.getInt16Ty(), Src0->getType() });
629 return Builder.CreateCall(F, Src0);
630 }
631 case AMDGPU::BI__builtin_amdgcn_fract:
632 case AMDGPU::BI__builtin_amdgcn_fractf:
633 case AMDGPU::BI__builtin_amdgcn_fracth:
635 Intrinsic::amdgcn_fract);
636 case AMDGPU::BI__builtin_amdgcn_lerp:
638 Intrinsic::amdgcn_lerp);
639 case AMDGPU::BI__builtin_amdgcn_ubfe:
641 Intrinsic::amdgcn_ubfe);
642 case AMDGPU::BI__builtin_amdgcn_sbfe:
644 Intrinsic::amdgcn_sbfe);
645 case AMDGPU::BI__builtin_amdgcn_ballot_w32:
646 case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
647 llvm::Type *ResultType = ConvertType(E->getType());
648 llvm::Value *Src = EmitScalarExpr(E->getArg(0));
649 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
650 return Builder.CreateCall(F, { Src });
651 }
652 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
653 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
654 llvm::Value *Src = EmitScalarExpr(E->getArg(0));
655 Function *F =
656 CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()});
657 return Builder.CreateCall(F, {Src});
658 }
659 case AMDGPU::BI__builtin_amdgcn_tanhf:
660 case AMDGPU::BI__builtin_amdgcn_tanhh:
661 case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
663 Intrinsic::amdgcn_tanh);
664 case AMDGPU::BI__builtin_amdgcn_uicmp:
665 case AMDGPU::BI__builtin_amdgcn_uicmpl:
666 case AMDGPU::BI__builtin_amdgcn_sicmp:
667 case AMDGPU::BI__builtin_amdgcn_sicmpl: {
668 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
669 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
670 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
671
672 // FIXME-GFX10: How should 32 bit mask be handled?
673 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
674 { Builder.getInt64Ty(), Src0->getType() });
675 return Builder.CreateCall(F, { Src0, Src1, Src2 });
676 }
677 case AMDGPU::BI__builtin_amdgcn_fcmp:
678 case AMDGPU::BI__builtin_amdgcn_fcmpf: {
679 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
680 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
681 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
682
683 // FIXME-GFX10: How should 32 bit mask be handled?
684 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
685 { Builder.getInt64Ty(), Src0->getType() });
686 return Builder.CreateCall(F, { Src0, Src1, Src2 });
687 }
688 case AMDGPU::BI__builtin_amdgcn_class:
689 case AMDGPU::BI__builtin_amdgcn_classf:
690 case AMDGPU::BI__builtin_amdgcn_classh:
691 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
692 case AMDGPU::BI__builtin_amdgcn_fmed3f:
693 case AMDGPU::BI__builtin_amdgcn_fmed3h:
695 Intrinsic::amdgcn_fmed3);
696 case AMDGPU::BI__builtin_amdgcn_ds_append:
697 case AMDGPU::BI__builtin_amdgcn_ds_consume: {
698 Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
699 Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
700 Value *Src0 = EmitScalarExpr(E->getArg(0));
701 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
702 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
703 }
704 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
705 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
706 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
707 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
708 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
709 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
710 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
711 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
712 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
713 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
714 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
715 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
716 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
717 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
718 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
719 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
720 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
721 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
722 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
723 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
724 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
725 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
726 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
727 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
728 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
729 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
730 Intrinsic::ID IID;
731 switch (BuiltinID) {
732 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
733 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
734 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
735 IID = Intrinsic::amdgcn_global_load_tr_b64;
736 break;
737 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
738 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
739 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
740 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
741 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
742 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
743 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
744 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
745 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
746 IID = Intrinsic::amdgcn_global_load_tr_b128;
747 break;
748 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
749 IID = Intrinsic::amdgcn_global_load_tr4_b64;
750 break;
751 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
752 IID = Intrinsic::amdgcn_global_load_tr6_b96;
753 break;
754 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
755 IID = Intrinsic::amdgcn_ds_load_tr4_b64;
756 break;
757 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
758 IID = Intrinsic::amdgcn_ds_load_tr6_b96;
759 break;
760 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
761 IID = Intrinsic::amdgcn_ds_load_tr8_b64;
762 break;
763 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
764 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
765 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
766 IID = Intrinsic::amdgcn_ds_load_tr16_b128;
767 break;
768 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
769 IID = Intrinsic::amdgcn_ds_read_tr4_b64;
770 break;
771 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
772 IID = Intrinsic::amdgcn_ds_read_tr8_b64;
773 break;
774 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
775 IID = Intrinsic::amdgcn_ds_read_tr6_b96;
776 break;
777 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
778 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
779 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
780 IID = Intrinsic::amdgcn_ds_read_tr16_b64;
781 break;
782 }
783 llvm::Type *LoadTy = ConvertType(E->getType());
784 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
785 llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
786 return Builder.CreateCall(F, {Addr});
787 }
788 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
789 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
790 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
791 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
792 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
793 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: {
794
795 Intrinsic::ID IID;
796 switch (BuiltinID) {
797 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
798 IID = Intrinsic::amdgcn_global_load_monitor_b32;
799 break;
800 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
801 IID = Intrinsic::amdgcn_global_load_monitor_b64;
802 break;
803 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
804 IID = Intrinsic::amdgcn_global_load_monitor_b128;
805 break;
806 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
807 IID = Intrinsic::amdgcn_flat_load_monitor_b32;
808 break;
809 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
810 IID = Intrinsic::amdgcn_flat_load_monitor_b64;
811 break;
812 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128:
813 IID = Intrinsic::amdgcn_flat_load_monitor_b128;
814 break;
815 }
816
817 llvm::Type *LoadTy = ConvertType(E->getType());
818 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
819 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
820 llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
821 return Builder.CreateCall(F, {Addr, Val});
822 }
823 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
824 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
825 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
826 Intrinsic::ID IID;
827 switch (BuiltinID) {
828 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
829 IID = Intrinsic::amdgcn_cluster_load_b32;
830 break;
831 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
832 IID = Intrinsic::amdgcn_cluster_load_b64;
833 break;
834 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
835 IID = Intrinsic::amdgcn_cluster_load_b128;
836 break;
837 }
839 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
840 Args.push_back(EmitScalarExpr(E->getArg(i)));
841 llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
842 return Builder.CreateCall(F, {Args});
843 }
844 case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
845 // Should this have asan instrumentation?
847 Intrinsic::amdgcn_load_to_lds);
848 }
849 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
850 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
851 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
852 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
853 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
854 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: {
855 Intrinsic::ID IID;
856 switch (BuiltinID) {
857 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
858 IID = Intrinsic::amdgcn_cooperative_atomic_load_32x4B;
859 break;
860 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
861 IID = Intrinsic::amdgcn_cooperative_atomic_store_32x4B;
862 break;
863 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
864 IID = Intrinsic::amdgcn_cooperative_atomic_load_16x8B;
865 break;
866 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
867 IID = Intrinsic::amdgcn_cooperative_atomic_store_16x8B;
868 break;
869 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
870 IID = Intrinsic::amdgcn_cooperative_atomic_load_8x16B;
871 break;
872 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
873 IID = Intrinsic::amdgcn_cooperative_atomic_store_8x16B;
874 break;
875 }
876
877 LLVMContext &Ctx = CGM.getLLVMContext();
879 // last argument is a MD string
880 const unsigned ScopeArg = E->getNumArgs() - 1;
881 for (unsigned i = 0; i != ScopeArg; ++i)
882 Args.push_back(EmitScalarExpr(E->getArg(i)));
883 StringRef Arg = cast<StringLiteral>(E->getArg(ScopeArg)->IgnoreParenCasts())
884 ->getString();
885 llvm::MDNode *MD = llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, Arg)});
886 Args.push_back(llvm::MetadataAsValue::get(Ctx, MD));
887 // Intrinsic is typed based on the pointer AS. Pointer is always the first
888 // argument.
889 llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
890 return Builder.CreateCall(F, {Args});
891 }
892 case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
893 Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
894 {llvm::Type::getInt64Ty(getLLVMContext())});
895 return Builder.CreateCall(F);
896 }
897 case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
898 Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
899 {llvm::Type::getInt64Ty(getLLVMContext())});
900 llvm::Value *Env = EmitScalarExpr(E->getArg(0));
901 return Builder.CreateCall(F, {Env});
902 }
903 case AMDGPU::BI__builtin_amdgcn_read_exec:
904 return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
905 case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
906 return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
907 case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
908 return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
909 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
910 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
911 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
912 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
913 llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
914 llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
915 llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
916 llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
917 llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
918 llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
919
920 // The builtins take these arguments as vec4 where the last element is
921 // ignored. The intrinsic takes them as vec3.
922 RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
923 {0, 1, 2});
924 RayDir =
925 Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
926 RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
927 {0, 1, 2});
928
929 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
930 {NodePtr->getType(), RayDir->getType()});
931 return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
932 RayInverseDir, TextureDescr});
933 }
934 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
935 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
936 Intrinsic::ID IID;
937 switch (BuiltinID) {
938 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
939 IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
940 break;
941 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
942 IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
943 break;
944 }
945 llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
946 llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
947 llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
948 llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
949 llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
950 llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
951 llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));
952
953 Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
954 Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));
955
956 llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
957
958 llvm::CallInst *CI = Builder.CreateCall(
959 IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
960 Offset, TextureDescr});
961
962 llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
963 llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
964 llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);
965
966 Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
967 Builder.CreateStore(RetRayDir, RetRayDirPtr);
968
969 return RetVData;
970 }
971
972 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
973 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
974 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
975 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
976 Intrinsic::ID IID;
977 switch (BuiltinID) {
978 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
979 IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
980 break;
981 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
982 IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
983 break;
984 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
985 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
986 break;
987 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
988 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
989 break;
990 }
991
993 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
994 Args.push_back(EmitScalarExpr(E->getArg(i)));
995
996 Function *F = CGM.getIntrinsic(IID);
997 Value *Call = Builder.CreateCall(F, Args);
998 Value *Rtn = Builder.CreateExtractValue(Call, 0);
999 Value *A = Builder.CreateExtractValue(Call, 1);
1000 llvm::Type *RetTy = ConvertType(E->getType());
1001 Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
1002 (uint64_t)0);
1003 // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
1004 // <2 x i64>, zext the second value.
1005 if (A->getType()->getPrimitiveSizeInBits() <
1006 RetTy->getScalarType()->getPrimitiveSizeInBits())
1007 A = Builder.CreateZExt(A, RetTy->getScalarType());
1008
1009 return Builder.CreateInsertElement(I0, A, 1);
1010 }
1011 case AMDGPU::BI__builtin_amdgcn_image_load_1d_v4f32_i32:
1012 case AMDGPU::BI__builtin_amdgcn_image_load_1d_v4f16_i32:
1014 *this, E, Intrinsic::amdgcn_image_load_1d, false);
1015 case AMDGPU::BI__builtin_amdgcn_image_load_1darray_v4f32_i32:
1016 case AMDGPU::BI__builtin_amdgcn_image_load_1darray_v4f16_i32:
1018 *this, E, Intrinsic::amdgcn_image_load_1darray, false);
1019 case AMDGPU::BI__builtin_amdgcn_image_load_2d_f32_i32:
1020 case AMDGPU::BI__builtin_amdgcn_image_load_2d_v4f32_i32:
1021 case AMDGPU::BI__builtin_amdgcn_image_load_2d_v4f16_i32:
1023 *this, E, Intrinsic::amdgcn_image_load_2d, false);
1024 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_f32_i32:
1025 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_v4f32_i32:
1026 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_v4f16_i32:
1028 *this, E, Intrinsic::amdgcn_image_load_2darray, false);
1029 case AMDGPU::BI__builtin_amdgcn_image_load_3d_v4f32_i32:
1030 case AMDGPU::BI__builtin_amdgcn_image_load_3d_v4f16_i32:
1032 *this, E, Intrinsic::amdgcn_image_load_3d, false);
1033 case AMDGPU::BI__builtin_amdgcn_image_load_cube_v4f32_i32:
1034 case AMDGPU::BI__builtin_amdgcn_image_load_cube_v4f16_i32:
1036 *this, E, Intrinsic::amdgcn_image_load_cube, false);
1037 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1d_v4f32_i32:
1038 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1d_v4f16_i32:
1040 *this, E, Intrinsic::amdgcn_image_load_mip_1d, false);
1041 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1darray_v4f32_i32:
1042 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1darray_v4f16_i32:
1044 *this, E, Intrinsic::amdgcn_image_load_mip_1darray, false);
1045 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_f32_i32:
1046 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_v4f32_i32:
1047 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_v4f16_i32:
1049 *this, E, Intrinsic::amdgcn_image_load_mip_2d, false);
1050 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_f32_i32:
1051 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_v4f32_i32:
1052 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_v4f16_i32:
1054 *this, E, Intrinsic::amdgcn_image_load_mip_2darray, false);
1055 case AMDGPU::BI__builtin_amdgcn_image_load_mip_3d_v4f32_i32:
1056 case AMDGPU::BI__builtin_amdgcn_image_load_mip_3d_v4f16_i32:
1058 *this, E, Intrinsic::amdgcn_image_load_mip_3d, false);
1059 case AMDGPU::BI__builtin_amdgcn_image_load_mip_cube_v4f32_i32:
1060 case AMDGPU::BI__builtin_amdgcn_image_load_mip_cube_v4f16_i32:
1062 *this, E, Intrinsic::amdgcn_image_load_mip_cube, false);
1063 case AMDGPU::BI__builtin_amdgcn_image_store_1d_v4f32_i32:
1064 case AMDGPU::BI__builtin_amdgcn_image_store_1d_v4f16_i32:
1066 *this, E, Intrinsic::amdgcn_image_store_1d, true);
1067 case AMDGPU::BI__builtin_amdgcn_image_store_1darray_v4f32_i32:
1068 case AMDGPU::BI__builtin_amdgcn_image_store_1darray_v4f16_i32:
1070 *this, E, Intrinsic::amdgcn_image_store_1darray, true);
1071 case AMDGPU::BI__builtin_amdgcn_image_store_2d_f32_i32:
1072 case AMDGPU::BI__builtin_amdgcn_image_store_2d_v4f32_i32:
1073 case AMDGPU::BI__builtin_amdgcn_image_store_2d_v4f16_i32:
1075 *this, E, Intrinsic::amdgcn_image_store_2d, true);
1076 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_f32_i32:
1077 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_v4f32_i32:
1078 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_v4f16_i32:
1080 *this, E, Intrinsic::amdgcn_image_store_2darray, true);
1081 case AMDGPU::BI__builtin_amdgcn_image_store_3d_v4f32_i32:
1082 case AMDGPU::BI__builtin_amdgcn_image_store_3d_v4f16_i32:
1084 *this, E, Intrinsic::amdgcn_image_store_3d, true);
1085 case AMDGPU::BI__builtin_amdgcn_image_store_cube_v4f32_i32:
1086 case AMDGPU::BI__builtin_amdgcn_image_store_cube_v4f16_i32:
1088 *this, E, Intrinsic::amdgcn_image_store_cube, true);
1089 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1d_v4f32_i32:
1090 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1d_v4f16_i32:
1092 *this, E, Intrinsic::amdgcn_image_store_mip_1d, true);
1093 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1darray_v4f32_i32:
1094 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1darray_v4f16_i32:
1096 *this, E, Intrinsic::amdgcn_image_store_mip_1darray, true);
1097 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_f32_i32:
1098 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_v4f32_i32:
1099 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_v4f16_i32:
1101 *this, E, Intrinsic::amdgcn_image_store_mip_2d, true);
1102 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_f32_i32:
1103 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_v4f32_i32:
1104 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_v4f16_i32:
1106 *this, E, Intrinsic::amdgcn_image_store_mip_2darray, true);
1107 case AMDGPU::BI__builtin_amdgcn_image_store_mip_3d_v4f32_i32:
1108 case AMDGPU::BI__builtin_amdgcn_image_store_mip_3d_v4f16_i32:
1110 *this, E, Intrinsic::amdgcn_image_store_mip_3d, true);
1111 case AMDGPU::BI__builtin_amdgcn_image_store_mip_cube_v4f32_i32:
1112 case AMDGPU::BI__builtin_amdgcn_image_store_mip_cube_v4f16_i32:
1114 *this, E, Intrinsic::amdgcn_image_store_mip_cube, true);
1115 case AMDGPU::BI__builtin_amdgcn_image_sample_1d_v4f32_f32:
1116 case AMDGPU::BI__builtin_amdgcn_image_sample_1d_v4f16_f32:
1118 *this, E, Intrinsic::amdgcn_image_sample_1d, false);
1119 case AMDGPU::BI__builtin_amdgcn_image_sample_1darray_v4f32_f32:
1120 case AMDGPU::BI__builtin_amdgcn_image_sample_1darray_v4f16_f32:
1122 *this, E, Intrinsic::amdgcn_image_sample_1darray, false);
1123 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_f32_f32:
1124 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_v4f32_f32:
1125 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_v4f16_f32:
1127 *this, E, Intrinsic::amdgcn_image_sample_2d, false);
1128 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_f32_f32:
1129 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_v4f32_f32:
1130 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_v4f16_f32:
1132 *this, E, Intrinsic::amdgcn_image_sample_2darray, false);
1133 case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f32_f32:
1134 case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f16_f32:
1136 *this, E, Intrinsic::amdgcn_image_sample_3d, false);
1137 case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f32_f32:
1138 case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32:
1140 *this, E, Intrinsic::amdgcn_image_sample_cube, false);
1141 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1142 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1143 llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
1144 Function *F = CGM.getIntrinsic(
1145 BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
1146 ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
1147 : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
1148 {VT, VT});
1149
1151 for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
1152 Args.push_back(EmitScalarExpr(E->getArg(I)));
1153 return Builder.CreateCall(F, Args);
1154 }
1155 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1156 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1157 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
1158 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
1159 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1160 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1161 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
1162 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
1163 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1164 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1165 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
1166 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
1167 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1168 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1169 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1170 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1171 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
1172 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
1173 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
1174 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
1175 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
1176 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
1177 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
1178 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
1179 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1180 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
1181 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1182 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
1183 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1184 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
1185 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1186 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
1187 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1188 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
1189 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1190 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
1191 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1192 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
1193 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1194 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
1195 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1196 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
1197 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1198 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
1199 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1200 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1201 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1202 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1203 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1204 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1205 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1206 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1207 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1208 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1209 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1210 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1211 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1212 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1213 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1214 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1215 // GFX1250 WMMA builtins
1216 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1217 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1218 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1219 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1220 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1221 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1222 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1223 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1224 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1225 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1226 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1227 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1228 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1229 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1230 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1231 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1232 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1233 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1234 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1235 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1236 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1237 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1238 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1239 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1240 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1241 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1242 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1243 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1244 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1245 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1246 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1247 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1248 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1249 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1250 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1251 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1252 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1253 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1254 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1255 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1256 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1257 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1258 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
1259
1260 // These operations perform a matrix multiplication and accumulation of
1261 // the form:
1262 // D = A * B + C
1263 // We need to specify one type for matrices AB and one for matrices CD.
1264 // Sparse matrix operations can have different types for A and B as well as
1265 // an additional type for sparsity index.
1266 // Destination type should be put before types used for source operands.
1267 SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
1268 // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
1269 // There is no need for the variable opsel argument, so always set it to
1270 // "false".
1271 bool AppendFalseForOpselArg = false;
1272 unsigned BuiltinWMMAOp;
1273 // Need return type when D and C are of different types.
1274 bool NeedReturnType = false;
1275
1276 switch (BuiltinID) {
1277 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
1278 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
1279 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
1280 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
1281 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1282 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
1283 break;
1284 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1285 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1286 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
1287 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
1288 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1289 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
1290 break;
1291 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
1292 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
1293 AppendFalseForOpselArg = true;
1294 [[fallthrough]];
1295 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1296 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
1297 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1298 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
1299 break;
1300 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
1301 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
1302 AppendFalseForOpselArg = true;
1303 [[fallthrough]];
1304 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1305 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
1306 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1307 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
1308 break;
1309 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1310 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
1311 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1312 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
1313 break;
1314 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1315 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
1316 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1317 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
1318 break;
1319 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1320 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1321 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1322 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
1323 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1324 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
1325 break;
1326 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1327 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1328 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1329 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
1330 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1331 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
1332 break;
1333 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1334 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
1335 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1336 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
1337 break;
1338 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1339 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
1340 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1341 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
1342 break;
1343 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1344 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
1345 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1346 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
1347 break;
1348 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1349 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
1350 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1351 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
1352 break;
1353 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1354 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
1355 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1356 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
1357 break;
1358 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1359 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
1360 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1361 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
1362 break;
1363 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1364 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
1365 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1366 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
1367 break;
1368 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1369 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
1370 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1371 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
1372 break;
1373 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1374 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1375 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1376 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
1377 break;
1378 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1379 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1380 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1381 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
1382 break;
1383 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1384 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1385 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1386 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
1387 break;
1388 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1389 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1390 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1391 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
1392 break;
1393 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1394 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1395 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1396 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
1397 break;
1398 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1399 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1400 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1401 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
1402 break;
1403 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1404 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1405 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1406 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
1407 break;
1408 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1409 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1410 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1411 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
1412 break;
1413 // GFX1250 WMMA builtins
1414 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1415 ArgsForMatchingMatrixTypes = {5, 1};
1416 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
1417 break;
1418 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1419 ArgsForMatchingMatrixTypes = {5, 1};
1420 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
1421 break;
1422 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1423 ArgsForMatchingMatrixTypes = {5, 1};
1424 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
1425 break;
1426 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1427 ArgsForMatchingMatrixTypes = {5, 1};
1428 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
1429 break;
1430 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1431 ArgsForMatchingMatrixTypes = {5, 1};
1432 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
1433 break;
1434 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1435 NeedReturnType = true;
1436 ArgsForMatchingMatrixTypes = {1, 5};
1437 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
1438 break;
1439 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1440 ArgsForMatchingMatrixTypes = {3, 0};
1441 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
1442 break;
1443 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1444 ArgsForMatchingMatrixTypes = {3, 0};
1445 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
1446 break;
1447 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1448 ArgsForMatchingMatrixTypes = {3, 0};
1449 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
1450 break;
1451 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1452 ArgsForMatchingMatrixTypes = {3, 0};
1453 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
1454 break;
1455 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1456 ArgsForMatchingMatrixTypes = {3, 0};
1457 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
1458 break;
1459 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1460 ArgsForMatchingMatrixTypes = {3, 0};
1461 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
1462 break;
1463 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1464 ArgsForMatchingMatrixTypes = {3, 0};
1465 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
1466 break;
1467 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1468 ArgsForMatchingMatrixTypes = {3, 0};
1469 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
1470 break;
1471 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1472 ArgsForMatchingMatrixTypes = {3, 0};
1473 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
1474 break;
1475 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1476 ArgsForMatchingMatrixTypes = {3, 0};
1477 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
1478 break;
1479 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1480 ArgsForMatchingMatrixTypes = {3, 0};
1481 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
1482 break;
1483 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1484 ArgsForMatchingMatrixTypes = {3, 0};
1485 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
1486 break;
1487 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1488 ArgsForMatchingMatrixTypes = {3, 0};
1489 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
1490 break;
1491 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1492 ArgsForMatchingMatrixTypes = {3, 0};
1493 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
1494 break;
1495 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1496 ArgsForMatchingMatrixTypes = {3, 0};
1497 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
1498 break;
1499 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1500 ArgsForMatchingMatrixTypes = {3, 0};
1501 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
1502 break;
1503 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1504 ArgsForMatchingMatrixTypes = {4, 1};
1505 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
1506 break;
1507 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1508 ArgsForMatchingMatrixTypes = {5, 1, 3};
1509 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
1510 break;
1511 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1512 ArgsForMatchingMatrixTypes = {5, 1, 3};
1513 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4;
1514 break;
1515 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1516 ArgsForMatchingMatrixTypes = {5, 1, 3};
1517 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4;
1518 break;
1519 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1520 ArgsForMatchingMatrixTypes = {3, 0, 1};
1521 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
1522 break;
1523 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1524 ArgsForMatchingMatrixTypes = {3, 0, 1};
1525 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
1526 break;
1527 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1528 ArgsForMatchingMatrixTypes = {3, 0, 1};
1529 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
1530 break;
1531 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1532 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1533 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
1534 break;
1535 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1536 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1537 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
1538 break;
1539 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1540 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1541 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
1542 break;
1543 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1544 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1545 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
1546 break;
1547 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1548 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1549 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
1550 break;
1551 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1552 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1553 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
1554 break;
1555 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1556 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1557 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
1558 break;
1559 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1560 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1561 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
1562 break;
1563 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1564 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1565 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
1566 break;
1567 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1568 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1569 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
1570 break;
1571 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1572 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1573 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
1574 break;
1575 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1576 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1577 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
1578 break;
1579 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1580 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1581 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
1582 break;
1583 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
1584 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1585 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
1586 break;
1587 }
1588
1590 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
1591 Args.push_back(EmitScalarExpr(E->getArg(i)));
1592 if (AppendFalseForOpselArg)
1593 Args.push_back(Builder.getFalse());
1594
1596 if (NeedReturnType)
1597 ArgTypes.push_back(ConvertType(E->getType()));
1598 for (auto ArgIdx : ArgsForMatchingMatrixTypes)
1599 ArgTypes.push_back(Args[ArgIdx]->getType());
1600
1601 Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
1602 return Builder.CreateCall(F, Args);
1603 }
1604 // amdgcn workgroup size
1605 case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
1606 return EmitAMDGPUWorkGroupSize(*this, 0);
1607 case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
1608 return EmitAMDGPUWorkGroupSize(*this, 1);
1609 case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
1610 return EmitAMDGPUWorkGroupSize(*this, 2);
1611
1612 // amdgcn grid size
1613 case AMDGPU::BI__builtin_amdgcn_grid_size_x:
1614 return EmitAMDGPUGridSize(*this, 0);
1615 case AMDGPU::BI__builtin_amdgcn_grid_size_y:
1616 return EmitAMDGPUGridSize(*this, 1);
1617 case AMDGPU::BI__builtin_amdgcn_grid_size_z:
1618 return EmitAMDGPUGridSize(*this, 2);
1619
1620 // r600 intrinsics
1621 case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
1622 case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
1624 Intrinsic::r600_recipsqrt_ieee);
1625 case AMDGPU::BI__builtin_amdgcn_alignbit: {
1626 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
1627 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
1628 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
1629 Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
1630 return Builder.CreateCall(F, { Src0, Src1, Src2 });
1631 }
1632 case AMDGPU::BI__builtin_amdgcn_fence: {
1634 EmitScalarExpr(E->getArg(1)), AO, SSID);
1635 FenceInst *Fence = Builder.CreateFence(AO, SSID);
1636 if (E->getNumArgs() > 2)
1638 return Fence;
1639 }
1640 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1641 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1642 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1643 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1644 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1645 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1646 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1647 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1648 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1649 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1650 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1651 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1652 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1653 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1654 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1655 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1656 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1657 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1658 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1659 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1660 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1661 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1662 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1663 llvm::AtomicRMWInst::BinOp BinOp;
1664 switch (BuiltinID) {
1665 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1666 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1667 BinOp = llvm::AtomicRMWInst::UIncWrap;
1668 break;
1669 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1670 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1671 BinOp = llvm::AtomicRMWInst::UDecWrap;
1672 break;
1673 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1674 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1675 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1676 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1677 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1678 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1679 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1680 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1681 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1682 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1683 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1684 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1685 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1686 BinOp = llvm::AtomicRMWInst::FAdd;
1687 break;
1688 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1689 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1690 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1691 BinOp = llvm::AtomicRMWInst::FMin;
1692 break;
1693 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1694 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1695 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1696 BinOp = llvm::AtomicRMWInst::FMax;
1697 break;
1698 }
1699
1700 Address Ptr = CheckAtomicAlignment(*this, E);
1701 Value *Val = EmitScalarExpr(E->getArg(1));
1702 llvm::Type *OrigTy = Val->getType();
1703 QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
1704
1705 bool Volatile;
1706
1707 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1708 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1709 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
1710 // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
1711 Volatile =
1712 cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
1713 } else {
1714 // Infer volatile from the passed type.
1715 Volatile =
1716 PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1717 }
1718
1719 if (E->getNumArgs() >= 4) {
1720 // Some of the builtins have explicit ordering and scope arguments.
1722 EmitScalarExpr(E->getArg(3)), AO, SSID);
1723 } else {
1724 // Most of the builtins do not have syncscope/order arguments. For DS
1725 // atomics the scope doesn't really matter, as they implicitly operate at
1726 // workgroup scope.
1727 //
1728 // The global/flat cases need to use agent scope to consistently produce
1729 // the native instruction instead of a cmpxchg expansion.
1730 if (getTarget().getTriple().isSPIRV())
1731 SSID = getLLVMContext().getOrInsertSyncScopeID("device");
1732 else
1733 SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1734 AO = AtomicOrdering::Monotonic;
1735
1736 // The v2bf16 builtin uses i16 instead of a natural bfloat type.
1737 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
1738 BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
1739 BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1740 llvm::Type *V2BF16Ty = FixedVectorType::get(
1741 llvm::Type::getBFloatTy(Builder.getContext()), 2);
1742 Val = Builder.CreateBitCast(Val, V2BF16Ty);
1743 }
1744 }
1745
1746 llvm::AtomicRMWInst *RMW =
1747 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1748 if (Volatile)
1749 RMW->setVolatile(true);
1750
1751 unsigned AddrSpace = Ptr.getType()->getAddressSpace();
1752 if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
1753 // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
1754 // instruction for flat and global operations.
1755 llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
1756 RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
1757
1758 // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
1759 // instruction, but this only matters for float fadd.
1760 if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
1761 RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
1762 }
1763
1764 return Builder.CreateBitCast(RMW, OrigTy);
1765 }
1766 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1767 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
1768 llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
1769 llvm::Type *ResultType = ConvertType(E->getType());
1770 // s_sendmsg_rtn is mangled using return type only.
1771 Function *F =
1772 CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
1773 return Builder.CreateCall(F, {Arg});
1774 }
1775 case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
1776 case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
1777 // Because builtin types are limited, and the intrinsic uses a struct/pair
1778 // output, marshal the pair-of-i32 to <2 x i32>.
1779 Value *VDstOld = EmitScalarExpr(E->getArg(0));
1780 Value *VSrcOld = EmitScalarExpr(E->getArg(1));
1781 Value *FI = EmitScalarExpr(E->getArg(2));
1782 Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
1783 Function *F =
1784 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
1785 ? Intrinsic::amdgcn_permlane16_swap
1786 : Intrinsic::amdgcn_permlane32_swap);
1787 llvm::CallInst *Call =
1788 Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});
1789
1790 llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
1791 llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);
1792
1793 llvm::Type *ResultType = ConvertType(E->getType());
1794
1795 llvm::Value *Insert0 = Builder.CreateInsertElement(
1796 llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
1797 llvm::Value *AsVector =
1798 Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
1799 return AsVector;
1800 }
1801 case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
1802 case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
1804 Intrinsic::amdgcn_bitop3);
1805 case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
1806 // TODO: LLVM has this overloaded to allow for fat pointers, but since
1807 // those haven't been plumbed through to Clang yet, default to creating the
1808 // resource type.
1810 for (unsigned I = 0; I < 4; ++I)
1811 Args.push_back(EmitScalarExpr(E->getArg(I)));
1812 llvm::PointerType *RetTy = llvm::PointerType::get(
1813 Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
1814 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
1815 {RetTy, Args[0]->getType()});
1816 return Builder.CreateCall(F, Args);
1817 }
1818 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
1819 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
1820 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
1821 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
1822 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
1823 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
1825 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
1826 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1827 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1828 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1829 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1830 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1831 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
1832 llvm::Type *RetTy = nullptr;
1833 switch (BuiltinID) {
1834 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1835 RetTy = Int8Ty;
1836 break;
1837 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1838 RetTy = Int16Ty;
1839 break;
1840 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1841 RetTy = Int32Ty;
1842 break;
1843 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1844 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
1845 break;
1846 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1847 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
1848 break;
1849 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
1850 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
1851 break;
1852 }
1853 Function *F =
1854 CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
1855 return Builder.CreateCall(
1856 F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
1857 EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
1858 }
1859 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32:
1861 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_add);
1862 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32:
1863 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16:
1865 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd);
1866 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32:
1867 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f64:
1869 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin);
1870 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32:
1871 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64:
1873 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax);
1874 case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
1876 *this, E, Intrinsic::amdgcn_s_prefetch_data);
1877 case Builtin::BIlogbf:
1878 case Builtin::BI__builtin_logbf: {
1879 Value *Src0 = EmitScalarExpr(E->getArg(0));
1880 Function *FrExpFunc = CGM.getIntrinsic(
1881 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1882 CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1883 Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1884 Value *Add = Builder.CreateAdd(
1885 Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1886 Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
1887 Value *Fabs =
1888 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1889 Value *FCmpONE = Builder.CreateFCmpONE(
1890 Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
1891 Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1892 Value *FCmpOEQ =
1893 Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy()));
1894 Value *Sel2 = Builder.CreateSelect(
1895 FCmpOEQ,
1896 ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1);
1897 return Sel2;
1898 }
1899 case Builtin::BIlogb:
1900 case Builtin::BI__builtin_logb: {
1901 Value *Src0 = EmitScalarExpr(E->getArg(0));
1902 Function *FrExpFunc = CGM.getIntrinsic(
1903 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1904 CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1905 Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1906 Value *Add = Builder.CreateAdd(
1907 Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1908 Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
1909 Value *Fabs =
1910 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1911 Value *FCmpONE = Builder.CreateFCmpONE(
1912 Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
1913 Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1914 Value *FCmpOEQ =
1915 Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy()));
1916 Value *Sel2 = Builder.CreateSelect(
1917 FCmpOEQ,
1918 ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
1919 Sel1);
1920 return Sel2;
1921 }
1922 case Builtin::BIscalbnf:
1923 case Builtin::BI__builtin_scalbnf:
1924 case Builtin::BIscalbn:
1925 case Builtin::BI__builtin_scalbn:
1927 *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
1928 default:
1929 return nullptr;
1930 }
1931}
#define V(N, I)
static Value * emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E, Intrinsic::ID IntrinsicID, Intrinsic::ID ConstrainedIntrinsicID)
Address CheckAtomicAlignment(CodeGenFunction &CGF, const CallExpr *E)
llvm::Value * emitBuiltinWithOneOverloadedType(clang::CodeGen::CodeGenFunction &CGF, const clang::CallExpr *E, unsigned IntrinsicID, llvm::StringRef Name="")
Definition CGBuiltin.h:63
static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID)
Definition AMDGPU.cpp:385
static StringRef mapScopeToSPIRV(StringRef AMDGCNScope)
Definition AMDGPU.cpp:266
static llvm::Value * loadTextureDescPtorAsVec8I32(CodeGenFunction &CGF, llvm::Value *RsrcPtr)
Definition AMDGPU.cpp:187
static Value * EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E, llvm::Type *RegisterType, llvm::Type *ValueType, bool isExecHi)
Definition AMDGPU.cpp:169
llvm::CallInst * emitAMDGCNImageOverloadedReturnType(clang::CodeGen::CodeGenFunction &CGF, const clang::CallExpr *E, unsigned IntrinsicID, bool IsImageStore)
Definition AMDGPU.cpp:217
static Value * emitFPIntBuiltin(CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID)
Definition AMDGPU.cpp:256
TokenType getType() const
Returns the token's type, e.g.
#define X(type, name)
Definition Value.h:97
HLSLResourceBindingAttr::RegisterType RegisterType
Definition SemaHLSL.cpp:57
static QualType getPointeeType(const MemRegion *R)
Enumerates target-specific builtins in their own namespaces within namespace clang.
QualType GetBuiltinType(unsigned ID, GetBuiltinTypeError &Error, unsigned *IntegerConstantArgs=nullptr) const
Return the type for the specified builtin.
unsigned getTargetAddressSpace(LangAS AS) const
@ GE_None
No error.
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition Expr.h:2877
Expr * getArg(unsigned Arg)
getArg - Return the specified argument.
Definition Expr.h:3081
unsigned getNumArgs() const
getNumArgs - Return the number of actual arguments to this call.
Definition Expr.h:3068
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition CharUnits.h:63
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition Address.h:128
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition Address.h:209
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition Address.h:204
Address CreateGEP(CodeGenFunction &CGF, Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition CGBuilder.h:296
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
Definition CGBuilder.h:112
llvm::LoadInst * CreateAlignedLoad(llvm::Type *Ty, llvm::Value *Addr, CharUnits Align, const llvm::Twine &Name="")
Definition CGBuilder.h:132
Address CreateAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition CGBuilder.h:193
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Value * EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx, const CallExpr *E)
Definition AMDGPU.cpp:345
llvm::Type * ConvertType(QualType T)
llvm::Value * EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E)
Definition AMDGPU.cpp:419
const TargetInfo & getTarget() const
void AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, const CallExpr *E)
Definition AMDGPU.cpp:362
Address EmitPointerWithAlignment(const Expr *Addr, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr, KnownNonNull_t IsKnownNonNull=NotKnownNonNull)
EmitPointerWithAlignment - Given an expression with a pointer type, emit the value and compute our be...
Definition CGExpr.cpp:1706
llvm::Value * EmitScalarExpr(const Expr *E, bool IgnoreResultAssign=false)
EmitScalarExpr - Emit the computation of the specified expression of LLVM scalar type,...
llvm::LLVMContext & getLLVMContext()
void ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope, llvm::AtomicOrdering &AO, llvm::SyncScope::ID &SSID)
Definition AMDGPU.cpp:280
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
const llvm::DataLayout & getDataLayout() const
ASTContext & getContext() const
llvm::Function * getIntrinsic(unsigned IID, ArrayRef< llvm::Type * > Tys={})
Expr * IgnoreParenCasts() LLVM_READONLY
Skip past any parentheses and casts which might surround this expression until reaching a fixed point...
Definition Expr.cpp:3090
std::optional< llvm::APSInt > getIntegerConstantExpr(const ASTContext &Ctx) const
isIntegerConstantExpr - Return the value if this expression is a valid integer constant expression.
Expr * IgnoreImpCasts() LLVM_READONLY
Skip past any implicit casts which might surround this expression until reaching a fixed point.
Definition Expr.cpp:3065
SourceLocation getExprLoc() const LLVM_READONLY
getExprLoc - Return the preferred location for the arrow when diagnosing a problem with a generic exp...
Definition Expr.cpp:273
QualType getType() const
Definition Expr.h:144
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition TypeBase.h:3328
A (possibly-)qualified type.
Definition TypeBase.h:937
bool isVolatileQualified() const
Determine whether this type is volatile-qualified.
Definition TypeBase.h:8362
QualType getCanonicalType() const
Definition TypeBase.h:8330
Scope - A scope is a transient data structure that is used while parsing the program.
Definition Scope.h:41
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition TargetInfo.h:323
unsigned getMaxOpenCLWorkGroupSize() const
Definition TargetInfo.h:870
llvm::CodeObjectVersionKind CodeObjectVersion
Code object version for AMDGPU.
QualType getType() const
Definition Value.cpp:237
The JSON file list parser is used to communicate input to InstallAPI.
@ Result
The result type of a method or function.
Definition TypeBase.h:905
U cast(CodeGen::Address addr)
Definition Address.h:327
Diagnostic wrappers for TextAPI types for error reporting.
Definition Dominators.h:30
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64