AMDGPU.cpp
1//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This contains code to emit Builtin calls as LLVM code.
10//
11//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "clang/Basic/TargetBuiltins.h"
15#include "llvm/Analysis/ValueTracking.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
18#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
19#include "llvm/Support/AMDGPUAddrSpace.h"
20
21using namespace clang;
22using namespace CodeGen;
23using namespace llvm;
24
25namespace {
26
27// Has second type mangled argument.
28static Value *
29emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
30 Intrinsic::ID IntrinsicID,
31 Intrinsic::ID ConstrainedIntrinsicID) {
32 llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
33 llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
34
35 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
36 if (CGF.Builder.getIsFPConstrained()) {
37 Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
38 {Src0->getType(), Src1->getType()});
39 return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
40 }
41
42 Function *F =
43 CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
44 return CGF.Builder.CreateCall(F, {Src0, Src1});
45}
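// Editorial sketch (not part of the upstream file): for a hypothetical caller
// passing Intrinsic::ldexp / Intrinsic::experimental_constrained_ldexp, the
// helper above would emit either
//   %r = call float @llvm.ldexp.f32.i32(float %x, i32 %n)
// or, when the builder is FP-constrained (e.g. under FENV_ACCESS ON),
//   %r = call float @llvm.experimental.constrained.ldexp.f32.i32(float %x, i32 %n,
//            metadata !"round.dynamic", metadata !"fpexcept.strict")
// i.e. the intrinsic is mangled on both operand types, hence the comment above.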
46
47// If \p E is not null pointer, insert address space cast to match return
48// type of \p E if necessary.
49Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
50 const CallExpr *E = nullptr) {
51 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
52 auto *Call = CGF.Builder.CreateCall(F);
53 Call->addRetAttr(
54 Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
55 Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
56 if (!E)
57 return Call;
58 QualType BuiltinRetType = E->getType();
59 auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
60 if (RetTy == Call->getType())
61 return Call;
62 return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
63}
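// Illustration (editor's note, not upstream text): the call created above is
// roughly
//   %dp = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// a pointer to the read-only HSA kernel_dispatch_packet; when the builtin's
// declared return type uses a different address space, an addrspacecast is
// appended.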
64
65Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
66 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
67 auto *Call = CGF.Builder.CreateCall(F);
68 Call->addRetAttr(
69 Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
70 Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
71 return Call;
72}
73
74// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
75/// Emit code based on Code Object ABI version.
76/// COV_4 : Emit code to use dispatch ptr
77/// COV_5+ : Emit code to use implicitarg ptr
78/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
79/// and use its value for COV_4 or COV_5+ approach. It is used for
80/// compiling device libraries in an ABI-agnostic way.
81Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
82 llvm::LoadInst *LD;
83
84 auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
85
86 if (Cov == CodeObjectVersionKind::COV_None) {
87 StringRef Name = "__oclc_ABI_version";
88 auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
89 if (!ABIVersionC)
90 ABIVersionC = new llvm::GlobalVariable(
91 CGF.CGM.getModule(), CGF.Int32Ty, false,
92 llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
93 llvm::GlobalVariable::NotThreadLocal,
94 CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
95
96 // This load will be eliminated by the IPSCCP because it is constant
97 // weak_odr without externally_initialized. Either changing it to weak or
98 // adding externally_initialized will keep the load.
99 Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
100 CGF.CGM.getIntAlign());
101
102 Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
103 ABIVersion,
104 llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));
105
106 // Indexing the implicit kernarg segment.
107 Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
108 CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
109
110 // Indexing the HSA kernel_dispatch_packet struct.
111 Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
112 CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
113
114 auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
115 LD = CGF.Builder.CreateLoad(
116 Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
117 } else {
118 Value *GEP = nullptr;
119 if (Cov >= CodeObjectVersionKind::COV_5) {
120 // Indexing the implicit kernarg segment.
121 GEP = CGF.Builder.CreateConstGEP1_32(
122 CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
123 } else {
124 // Indexing the HSA kernel_dispatch_packet struct.
125 GEP = CGF.Builder.CreateConstGEP1_32(
126 CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
127 }
128 LD = CGF.Builder.CreateLoad(
129 Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
130 }
131
132 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
133 llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
134 APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
135 LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
136 LD->setMetadata(llvm::LLVMContext::MD_noundef,
137 llvm::MDNode::get(CGF.getLLVMContext(), {}));
138 LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
139 llvm::MDNode::get(CGF.getLLVMContext(), {}));
140 return LD;
141}
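// Rough shape of the IR produced for, e.g., __builtin_amdgcn_workgroup_size_x
// when compiling for COV_5 (editorial illustration; offsets follow the code
// above):
//   %p = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
//   %gep = getelementptr i8, ptr addrspace(4) %p, i32 12
//   %wgs = load i16, ptr addrspace(4) %gep, align 2, !range !R, !noundef !N, !invariant.load !I
// with !R covering [1, getMaxOpenCLWorkGroupSize() + 1).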
142
143// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
144Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
145 const unsigned XOffset = 12;
146 auto *DP = EmitAMDGPUDispatchPtr(CGF);
147 // Indexing the HSA kernel_dispatch_packet struct.
148 auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
149 auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
150 auto *LD = CGF.Builder.CreateLoad(
151 Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));
152
153 llvm::MDBuilder MDB(CGF.getLLVMContext());
154
155 // Known non-zero.
156 LD->setMetadata(llvm::LLVMContext::MD_range,
157 MDB.createRange(APInt(32, 1), APInt::getZero(32)));
158 LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
159 llvm::MDNode::get(CGF.getLLVMContext(), {}));
160 return LD;
161}
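// Editor's note: grid_size_{x,y,z} live at byte offsets 12, 16 and 20 of the
// HSA kernel_dispatch_packet, so the load above is a 32-bit load from
// dispatch_ptr + 12 + Index * 4; the wrapped !range [1, 0) simply encodes
// "known non-zero".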
162} // namespace
163
164// Generates the IR for __builtin_read_exec_*.
165// Lowers the builtin to amdgcn_ballot intrinsic.
166static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
167 llvm::Type *RegisterType,
168 llvm::Type *ValueType, bool isExecHi) {
169 CodeGen::CGBuilderTy &Builder = CGF.Builder;
170 CodeGen::CodeGenModule &CGM = CGF.CGM;
171
172 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
173 llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});
174
175 if (isExecHi) {
176 Value *Rt2 = Builder.CreateLShr(Call, 32);
177 Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
178 return Rt2;
179 }
180
181 return Call;
182}
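// Illustrative lowering (not upstream text): __builtin_amdgcn_read_exec
// becomes
//   %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
// while __builtin_amdgcn_read_exec_hi additionally shifts the result right by
// 32 and truncates it to i32.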
183
184// Emit an intrinsic that has 1 float or double operand, and 1 integer.
185static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
186 const CallExpr *E,
187 unsigned IntrinsicID) {
188 llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
189 llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
190
191 Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
192 return CGF.Builder.CreateCall(F, {Src0, Src1});
193}
194
195static inline StringRef mapScopeToSPIRV(StringRef AMDGCNScope) {
196 if (AMDGCNScope == "agent")
197 return "device";
198 if (AMDGCNScope == "wavefront")
199 return "subgroup";
200 return AMDGCNScope;
201}
202
203// For processing memory ordering and memory scope arguments of various
204// amdgcn builtins.
205// \p Order takes a C++11 compatible memory-ordering specifier and converts
206// it into LLVM's memory ordering specifier using atomic C ABI, and writes
207// to \p AO. \p Scope takes a const char * and converts it into AMDGCN
208// specific SyncScopeID and writes it to \p SSID.
209void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
210 llvm::AtomicOrdering &AO,
211 llvm::SyncScope::ID &SSID) {
212 int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
213
214 // Map C11/C++11 memory ordering to LLVM memory ordering
215 assert(llvm::isValidAtomicOrderingCABI(ord));
216 switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
217 case llvm::AtomicOrderingCABI::acquire:
218 case llvm::AtomicOrderingCABI::consume:
219 AO = llvm::AtomicOrdering::Acquire;
220 break;
221 case llvm::AtomicOrderingCABI::release:
222 AO = llvm::AtomicOrdering::Release;
223 break;
224 case llvm::AtomicOrderingCABI::acq_rel:
225 AO = llvm::AtomicOrdering::AcquireRelease;
226 break;
227 case llvm::AtomicOrderingCABI::seq_cst:
228 AO = llvm::AtomicOrdering::SequentiallyConsistent;
229 break;
230 case llvm::AtomicOrderingCABI::relaxed:
231 AO = llvm::AtomicOrdering::Monotonic;
232 break;
233 }
234
235 // Some of the atomic builtins take the scope as a string name.
236 StringRef scp;
237 if (llvm::getConstantStringInfo(Scope, scp)) {
238 if (getTarget().getTriple().isSPIRV())
239 scp = mapScopeToSPIRV(scp);
240 SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
241 return;
242 }
243
244 // Older builtins had an enum argument for the memory scope.
245 int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
246 switch (scope) {
247 case 0: // __MEMORY_SCOPE_SYSTEM
248 SSID = llvm::SyncScope::System;
249 break;
250 case 1: // __MEMORY_SCOPE_DEVICE
251 if (getTarget().getTriple().isSPIRV())
252 SSID = getLLVMContext().getOrInsertSyncScopeID("device");
253 else
254 SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
255 break;
256 case 2: // __MEMORY_SCOPE_WRKGRP
257 SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
258 break;
259 case 3: // __MEMORY_SCOPE_WVFRNT
260 if (getTarget().getTriple().isSPIRV())
261 SSID = getLLVMContext().getOrInsertSyncScopeID("subgroup");
262 else
263 SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
264 break;
265 case 4: // __MEMORY_SCOPE_SINGLE
266 SSID = llvm::SyncScope::SingleThread;
267 break;
268 default:
269 SSID = llvm::SyncScope::System;
270 break;
271 }
272}
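// Editorial example of the mapping above (assumed call sites, not upstream):
// __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup") yields AO = Release
// and SSID = getOrInsertSyncScopeID("workgroup"); the enum form
// __MEMORY_SCOPE_DEVICE maps to syncscope "agent" on AMDGCN but "device" when
// targeting SPIR-V.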
273
274llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
275 unsigned Idx,
276 const CallExpr *E) {
277 llvm::Value *Arg = nullptr;
278 if ((ICEArguments & (1 << Idx)) == 0) {
279 Arg = EmitScalarExpr(E->getArg(Idx));
280 } else {
281 // If this is required to be a constant, constant fold it so that we
282 // know that the generated intrinsic gets a ConstantInt.
283 std::optional<llvm::APSInt> Result =
284 E->getArg(Idx)->getIntegerConstantExpr(getContext());
285 assert(Result && "Expected argument to be a constant");
286 Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
287 }
288 return Arg;
289}
290
291void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
292 const CallExpr *E) {
293 constexpr const char *Tag = "amdgpu-synchronize-as";
294
295 LLVMContext &Ctx = Inst->getContext();
296 SmallVector<MMRAMetadata::TagT, 3> MMRAs;
297 for (unsigned K = 2; K < E->getNumArgs(); ++K) {
298 llvm::Value *V = EmitScalarExpr(E->getArg(K));
299 StringRef AS;
300 if (llvm::getConstantStringInfo(V, AS)) {
301 MMRAs.push_back({Tag, AS});
302 // TODO: Delete the resulting unused constant?
303 continue;
304 }
305 CGM.Error(E->getExprLoc(),
306 "expected an address space name as a string literal");
307 }
308
309 llvm::sort(MMRAs);
310 MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
311 Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
312}
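// For illustration (editor's assumption about a typical call): a fence such
// as __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent", "local") gets !mmra
// metadata of the form !{!"amdgpu-synchronize-as", !"local"}, limiting which
// address spaces the fence is required to synchronize.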
313
314static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
315 switch (BuiltinID) {
316 default:
317 llvm_unreachable("Unknown BuiltinID for wave reduction");
318 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
319 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
320 return Intrinsic::amdgcn_wave_reduce_add;
321 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
322 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
323 return Intrinsic::amdgcn_wave_reduce_sub;
324 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
325 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
326 return Intrinsic::amdgcn_wave_reduce_min;
327 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
328 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
329 return Intrinsic::amdgcn_wave_reduce_umin;
330 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
331 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
332 return Intrinsic::amdgcn_wave_reduce_max;
333 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
334 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
335 return Intrinsic::amdgcn_wave_reduce_umax;
336 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
337 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
338 return Intrinsic::amdgcn_wave_reduce_and;
339 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
340 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
341 return Intrinsic::amdgcn_wave_reduce_or;
342 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
343 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
344 return Intrinsic::amdgcn_wave_reduce_xor;
345 }
346}
347
348Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
349 const CallExpr *E) {
350 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
351 llvm::SyncScope::ID SSID;
352 switch (BuiltinID) {
353 case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
354 case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
355 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
356 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
357 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
358 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
359 case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
360 case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
361 case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
362 case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
363 case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
364 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
365 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
366 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
367 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
368 case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
369 case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
370 case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: {
371 Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
372 llvm::Value *Value = EmitScalarExpr(E->getArg(0));
373 llvm::Value *Strategy = EmitScalarExpr(E->getArg(1));
374 llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
375 return Builder.CreateCall(F, {Value, Strategy});
376 }
377 case AMDGPU::BI__builtin_amdgcn_div_scale:
378 case AMDGPU::BI__builtin_amdgcn_div_scalef: {
379 // Translate from the intrinsic's struct return to the builtin's out
380 // argument.
381
382 Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
383
384 llvm::Value *X = EmitScalarExpr(E->getArg(0));
385 llvm::Value *Y = EmitScalarExpr(E->getArg(1));
386 llvm::Value *Z = EmitScalarExpr(E->getArg(2));
387
388 llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
389 X->getType());
390
391 llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
392
393 llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
394 llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
395
396 llvm::Type *RealFlagType = FlagOutPtr.getElementType();
397
398 llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
399 Builder.CreateStore(FlagExt, FlagOutPtr);
400 return Result;
401 }
402 case AMDGPU::BI__builtin_amdgcn_div_fmas:
403 case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
404 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
405 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
406 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
407 llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
408
409 llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
410 Src0->getType());
411 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
412 return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
413 }
414
415 case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
416 return emitBuiltinWithOneOverloadedType<2>(*this, E,
417 Intrinsic::amdgcn_ds_swizzle);
418 case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
419 case AMDGPU::BI__builtin_amdgcn_mov_dpp:
420 case AMDGPU::BI__builtin_amdgcn_update_dpp: {
421 llvm::SmallVector<llvm::Value *, 6> Args;
422 // Find out if any arguments are required to be integer constant
423 // expressions.
424 unsigned ICEArguments = 0;
425 ASTContext::GetBuiltinTypeError Error;
426 getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
427 assert(Error == ASTContext::GE_None && "Should not codegen an error");
428 llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
429 unsigned Size = DataTy->getPrimitiveSizeInBits();
430 llvm::Type *IntTy =
431 llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
432 Function *F =
433 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
434 ? Intrinsic::amdgcn_mov_dpp8
435 : Intrinsic::amdgcn_update_dpp,
436 IntTy);
437 assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
438 E->getNumArgs() == 2);
439 bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
440 if (InsertOld)
441 Args.push_back(llvm::PoisonValue::get(IntTy));
442 for (unsigned I = 0; I != E->getNumArgs(); ++I) {
443 llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
444 if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
445 Size < 32) {
446 if (!DataTy->isIntegerTy())
447 V = Builder.CreateBitCast(
448 V, llvm::IntegerType::get(Builder.getContext(), Size));
449 V = Builder.CreateZExtOrBitCast(V, IntTy);
450 }
451 llvm::Type *ExpTy =
452 F->getFunctionType()->getFunctionParamType(I + InsertOld);
453 Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
454 }
455 Value *V = Builder.CreateCall(F, Args);
456 if (Size < 32 && !DataTy->isIntegerTy())
457 V = Builder.CreateTrunc(
458 V, llvm::IntegerType::get(Builder.getContext(), Size));
459 return Builder.CreateTruncOrBitCast(V, DataTy);
460 }
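// Editorial note: the widening/narrowing above exists because the DPP
// intrinsics are presumably only defined for 32-bit (and wider) lanes, so
// sub-32-bit payloads are zero-extended to i32 for the call and truncated or
// bitcast back to the builtin's type afterwards.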
461 case AMDGPU::BI__builtin_amdgcn_permlane16:
462 case AMDGPU::BI__builtin_amdgcn_permlanex16:
463 return emitBuiltinWithOneOverloadedType<6>(
464 *this, E,
465 BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
466 ? Intrinsic::amdgcn_permlane16
467 : Intrinsic::amdgcn_permlanex16);
468 case AMDGPU::BI__builtin_amdgcn_permlane64:
469 return emitBuiltinWithOneOverloadedType<1>(*this, E,
470 Intrinsic::amdgcn_permlane64);
471 case AMDGPU::BI__builtin_amdgcn_readlane:
472 return emitBuiltinWithOneOverloadedType<2>(*this, E,
473 Intrinsic::amdgcn_readlane);
474 case AMDGPU::BI__builtin_amdgcn_readfirstlane:
475 return emitBuiltinWithOneOverloadedType<1>(*this, E,
476 Intrinsic::amdgcn_readfirstlane);
477 case AMDGPU::BI__builtin_amdgcn_div_fixup:
478 case AMDGPU::BI__builtin_amdgcn_div_fixupf:
479 case AMDGPU::BI__builtin_amdgcn_div_fixuph:
480 return emitBuiltinWithOneOverloadedType<3>(*this, E,
481 Intrinsic::amdgcn_div_fixup);
482 case AMDGPU::BI__builtin_amdgcn_trig_preop:
483 case AMDGPU::BI__builtin_amdgcn_trig_preopf:
484 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
485 case AMDGPU::BI__builtin_amdgcn_rcp:
486 case AMDGPU::BI__builtin_amdgcn_rcpf:
487 case AMDGPU::BI__builtin_amdgcn_rcph:
488 case AMDGPU::BI__builtin_amdgcn_rcp_bf16:
489 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
490 case AMDGPU::BI__builtin_amdgcn_sqrt:
491 case AMDGPU::BI__builtin_amdgcn_sqrtf:
492 case AMDGPU::BI__builtin_amdgcn_sqrth:
493 case AMDGPU::BI__builtin_amdgcn_sqrt_bf16:
494 return emitBuiltinWithOneOverloadedType<1>(*this, E,
495 Intrinsic::amdgcn_sqrt);
496 case AMDGPU::BI__builtin_amdgcn_rsq:
497 case AMDGPU::BI__builtin_amdgcn_rsqf:
498 case AMDGPU::BI__builtin_amdgcn_rsqh:
499 case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
500 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
501 case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
502 case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
503 return emitBuiltinWithOneOverloadedType<1>(*this, E,
504 Intrinsic::amdgcn_rsq_clamp);
505 case AMDGPU::BI__builtin_amdgcn_sinf:
506 case AMDGPU::BI__builtin_amdgcn_sinh:
507 case AMDGPU::BI__builtin_amdgcn_sin_bf16:
508 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
509 case AMDGPU::BI__builtin_amdgcn_cosf:
510 case AMDGPU::BI__builtin_amdgcn_cosh:
511 case AMDGPU::BI__builtin_amdgcn_cos_bf16:
512 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
513 case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
514 return EmitAMDGPUDispatchPtr(*this, E);
515 case AMDGPU::BI__builtin_amdgcn_logf:
516 case AMDGPU::BI__builtin_amdgcn_log_bf16:
517 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
518 case AMDGPU::BI__builtin_amdgcn_exp2f:
519 case AMDGPU::BI__builtin_amdgcn_exp2_bf16:
520 return emitBuiltinWithOneOverloadedType<1>(*this, E,
521 Intrinsic::amdgcn_exp2);
522 case AMDGPU::BI__builtin_amdgcn_log_clampf:
523 return emitBuiltinWithOneOverloadedType<1>(*this, E,
524 Intrinsic::amdgcn_log_clamp);
525 case AMDGPU::BI__builtin_amdgcn_ldexp:
526 case AMDGPU::BI__builtin_amdgcn_ldexpf: {
527 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
528 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
529 llvm::Function *F =
530 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
531 return Builder.CreateCall(F, {Src0, Src1});
532 }
533 case AMDGPU::BI__builtin_amdgcn_ldexph: {
534 // The raw instruction has a different behavior for out of bounds exponent
535 // values (implicit truncation instead of saturate to short_min/short_max).
536 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
537 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
538 llvm::Function *F =
539 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
540 return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
541 }
542 case AMDGPU::BI__builtin_amdgcn_frexp_mant:
543 case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
544 case AMDGPU::BI__builtin_amdgcn_frexp_manth:
545 return emitBuiltinWithOneOverloadedType<1>(*this, E,
546 Intrinsic::amdgcn_frexp_mant);
547 case AMDGPU::BI__builtin_amdgcn_frexp_exp:
548 case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
549 Value *Src0 = EmitScalarExpr(E->getArg(0));
550 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
551 { Builder.getInt32Ty(), Src0->getType() });
552 return Builder.CreateCall(F, Src0);
553 }
554 case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
555 Value *Src0 = EmitScalarExpr(E->getArg(0));
556 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
557 { Builder.getInt16Ty(), Src0->getType() });
558 return Builder.CreateCall(F, Src0);
559 }
560 case AMDGPU::BI__builtin_amdgcn_fract:
561 case AMDGPU::BI__builtin_amdgcn_fractf:
562 case AMDGPU::BI__builtin_amdgcn_fracth:
563 return emitBuiltinWithOneOverloadedType<1>(*this, E,
564 Intrinsic::amdgcn_fract);
565 case AMDGPU::BI__builtin_amdgcn_lerp:
566 return emitBuiltinWithOneOverloadedType<3>(*this, E,
567 Intrinsic::amdgcn_lerp);
568 case AMDGPU::BI__builtin_amdgcn_ubfe:
569 return emitBuiltinWithOneOverloadedType<3>(*this, E,
570 Intrinsic::amdgcn_ubfe);
571 case AMDGPU::BI__builtin_amdgcn_sbfe:
572 return emitBuiltinWithOneOverloadedType<3>(*this, E,
573 Intrinsic::amdgcn_sbfe);
574 case AMDGPU::BI__builtin_amdgcn_ballot_w32:
575 case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
576 llvm::Type *ResultType = ConvertType(E->getType());
577 llvm::Value *Src = EmitScalarExpr(E->getArg(0));
578 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
579 return Builder.CreateCall(F, { Src });
580 }
581 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
582 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
583 llvm::Value *Src = EmitScalarExpr(E->getArg(0));
584 Function *F =
585 CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()});
586 return Builder.CreateCall(F, {Src});
587 }
588 case AMDGPU::BI__builtin_amdgcn_tanhf:
589 case AMDGPU::BI__builtin_amdgcn_tanhh:
590 case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
591 return emitBuiltinWithOneOverloadedType<1>(*this, E,
592 Intrinsic::amdgcn_tanh);
593 case AMDGPU::BI__builtin_amdgcn_uicmp:
594 case AMDGPU::BI__builtin_amdgcn_uicmpl:
595 case AMDGPU::BI__builtin_amdgcn_sicmp:
596 case AMDGPU::BI__builtin_amdgcn_sicmpl: {
597 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
598 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
599 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
600
601 // FIXME-GFX10: How should 32 bit mask be handled?
602 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
603 { Builder.getInt64Ty(), Src0->getType() });
604 return Builder.CreateCall(F, { Src0, Src1, Src2 });
605 }
606 case AMDGPU::BI__builtin_amdgcn_fcmp:
607 case AMDGPU::BI__builtin_amdgcn_fcmpf: {
608 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
609 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
610 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
611
612 // FIXME-GFX10: How should 32 bit mask be handled?
613 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
614 { Builder.getInt64Ty(), Src0->getType() });
615 return Builder.CreateCall(F, { Src0, Src1, Src2 });
616 }
617 case AMDGPU::BI__builtin_amdgcn_class:
618 case AMDGPU::BI__builtin_amdgcn_classf:
619 case AMDGPU::BI__builtin_amdgcn_classh:
620 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
621 case AMDGPU::BI__builtin_amdgcn_fmed3f:
622 case AMDGPU::BI__builtin_amdgcn_fmed3h:
623 return emitBuiltinWithOneOverloadedType<3>(*this, E,
624 Intrinsic::amdgcn_fmed3);
625 case AMDGPU::BI__builtin_amdgcn_ds_append:
626 case AMDGPU::BI__builtin_amdgcn_ds_consume: {
627 Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
628 Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
629 Value *Src0 = EmitScalarExpr(E->getArg(0));
630 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
631 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
632 }
633 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
634 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
635 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
636 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
637 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
638 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
639 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
640 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
641 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
642 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
643 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
644 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
645 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
646 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
647 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
648 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
649 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
650 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
651 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
652 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
653 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
654 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
655 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
656 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
657 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
658 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
659 Intrinsic::ID IID;
660 switch (BuiltinID) {
661 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
662 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
663 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
664 IID = Intrinsic::amdgcn_global_load_tr_b64;
665 break;
666 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
667 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
668 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
669 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
670 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
671 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
672 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
673 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
674 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
675 IID = Intrinsic::amdgcn_global_load_tr_b128;
676 break;
677 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
678 IID = Intrinsic::amdgcn_global_load_tr4_b64;
679 break;
680 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
681 IID = Intrinsic::amdgcn_global_load_tr6_b96;
682 break;
683 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
684 IID = Intrinsic::amdgcn_ds_load_tr4_b64;
685 break;
686 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
687 IID = Intrinsic::amdgcn_ds_load_tr6_b96;
688 break;
689 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
690 IID = Intrinsic::amdgcn_ds_load_tr8_b64;
691 break;
692 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
693 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
694 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
695 IID = Intrinsic::amdgcn_ds_load_tr16_b128;
696 break;
697 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
698 IID = Intrinsic::amdgcn_ds_read_tr4_b64;
699 break;
700 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
701 IID = Intrinsic::amdgcn_ds_read_tr8_b64;
702 break;
703 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
704 IID = Intrinsic::amdgcn_ds_read_tr6_b96;
705 break;
706 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
707 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
708 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
709 IID = Intrinsic::amdgcn_ds_read_tr16_b64;
710 break;
711 }
712 llvm::Type *LoadTy = ConvertType(E->getType());
713 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
714 llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
715 return Builder.CreateCall(F, {Addr});
716 }
717 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
718 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
719 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
720 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
721 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
722 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: {
723
724 Intrinsic::ID IID;
725 switch (BuiltinID) {
726 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
727 IID = Intrinsic::amdgcn_global_load_monitor_b32;
728 break;
729 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
730 IID = Intrinsic::amdgcn_global_load_monitor_b64;
731 break;
732 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
733 IID = Intrinsic::amdgcn_global_load_monitor_b128;
734 break;
735 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
736 IID = Intrinsic::amdgcn_flat_load_monitor_b32;
737 break;
738 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
739 IID = Intrinsic::amdgcn_flat_load_monitor_b64;
740 break;
741 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128:
742 IID = Intrinsic::amdgcn_flat_load_monitor_b128;
743 break;
744 }
745
746 llvm::Type *LoadTy = ConvertType(E->getType());
747 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
748 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
749 llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
750 return Builder.CreateCall(F, {Addr, Val});
751 }
752 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
753 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
754 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
755 Intrinsic::ID IID;
756 switch (BuiltinID) {
757 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
758 IID = Intrinsic::amdgcn_cluster_load_b32;
759 break;
760 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
761 IID = Intrinsic::amdgcn_cluster_load_b64;
762 break;
763 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
764 IID = Intrinsic::amdgcn_cluster_load_b128;
765 break;
766 }
767 SmallVector<Value *, 3> Args;
768 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
769 Args.push_back(EmitScalarExpr(E->getArg(i)));
770 llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
771 return Builder.CreateCall(F, {Args});
772 }
773 case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
774 // Should this have asan instrumentation?
775 return emitBuiltinWithOneOverloadedType<5>(*this, E,
776 Intrinsic::amdgcn_load_to_lds);
777 }
778 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
779 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
780 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
781 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
782 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
783 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: {
784 Intrinsic::ID IID;
785 switch (BuiltinID) {
786 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
787 IID = Intrinsic::amdgcn_cooperative_atomic_load_32x4B;
788 break;
789 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
790 IID = Intrinsic::amdgcn_cooperative_atomic_store_32x4B;
791 break;
792 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
793 IID = Intrinsic::amdgcn_cooperative_atomic_load_16x8B;
794 break;
795 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
796 IID = Intrinsic::amdgcn_cooperative_atomic_store_16x8B;
797 break;
798 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
799 IID = Intrinsic::amdgcn_cooperative_atomic_load_8x16B;
800 break;
801 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
802 IID = Intrinsic::amdgcn_cooperative_atomic_store_8x16B;
803 break;
804 }
805
806 LLVMContext &Ctx = CGM.getLLVMContext();
807 SmallVector<Value *, 5> Args;
808 // last argument is an MD string
809 const unsigned ScopeArg = E->getNumArgs() - 1;
810 for (unsigned i = 0; i != ScopeArg; ++i)
811 Args.push_back(EmitScalarExpr(E->getArg(i)));
812 StringRef Arg = cast<StringLiteral>(E->getArg(ScopeArg)->IgnoreParenCasts())
813 ->getString();
814 llvm::MDNode *MD = llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, Arg)});
815 Args.push_back(llvm::MetadataAsValue::get(Ctx, MD));
816 // Intrinsic is typed based on the pointer AS. Pointer is always the first
817 // argument.
818 llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
819 return Builder.CreateCall(F, {Args});
820 }
821 case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
822 Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
823 {llvm::Type::getInt64Ty(getLLVMContext())});
824 return Builder.CreateCall(F);
825 }
826 case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
827 Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
828 {llvm::Type::getInt64Ty(getLLVMContext())});
829 llvm::Value *Env = EmitScalarExpr(E->getArg(0));
830 return Builder.CreateCall(F, {Env});
831 }
832 case AMDGPU::BI__builtin_amdgcn_read_exec:
833 return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
834 case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
835 return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
836 case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
837 return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
838 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
839 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
840 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
841 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
842 llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
843 llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
844 llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
845 llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
846 llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
847 llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
848
849 // The builtins take these arguments as vec4 where the last element is
850 // ignored. The intrinsic takes them as vec3.
851 RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
852 {0, 1, 2});
853 RayDir =
854 Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
855 RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
856 {0, 1, 2});
857
858 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
859 {NodePtr->getType(), RayDir->getType()});
860 return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
861 RayInverseDir, TextureDescr});
862 }
863 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
864 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
865 Intrinsic::ID IID;
866 switch (BuiltinID) {
867 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
868 IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
869 break;
870 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
871 IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
872 break;
873 }
874 llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
875 llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
876 llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
877 llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
878 llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
879 llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
880 llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));
881
882 Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
883 Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));
884
885 llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
886
887 llvm::CallInst *CI = Builder.CreateCall(
888 IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
889 Offset, TextureDescr});
890
891 llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
892 llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
893 llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);
894
895 Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
896 Builder.CreateStore(RetRayDir, RetRayDirPtr);
897
898 return RetVData;
899 }
900
901 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
902 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
903 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
904 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
905 Intrinsic::ID IID;
906 switch (BuiltinID) {
907 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
908 IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
909 break;
910 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
911 IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
912 break;
913 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
914 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
915 break;
916 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
917 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
918 break;
919 }
920
921 SmallVector<Value *, 4> Args;
922 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
923 Args.push_back(EmitScalarExpr(E->getArg(i)));
924
925 Function *F = CGM.getIntrinsic(IID);
926 Value *Call = Builder.CreateCall(F, Args);
927 Value *Rtn = Builder.CreateExtractValue(Call, 0);
928 Value *A = Builder.CreateExtractValue(Call, 1);
929 llvm::Type *RetTy = ConvertType(E->getType());
930 Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
931 (uint64_t)0);
932 // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
933 // <2 x i64>, zext the second value.
934 if (A->getType()->getPrimitiveSizeInBits() <
935 RetTy->getScalarType()->getPrimitiveSizeInBits())
936 A = Builder.CreateZExt(A, RetTy->getScalarType());
937
938 return Builder.CreateInsertElement(I0, A, 1);
939 }
940 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
941 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
942 llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
943 Function *F = CGM.getIntrinsic(
944 BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
945 ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
946 : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
947 {VT, VT});
948
949 llvm::SmallVector<llvm::Value *, 12> Args;
950 for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
951 Args.push_back(EmitScalarExpr(E->getArg(I)));
952 return Builder.CreateCall(F, Args);
953 }
954 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
955 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
956 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
957 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
958 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
959 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
960 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
961 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
962 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
963 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
964 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
965 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
966 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
967 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
968 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
969 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
970 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
971 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
972 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
973 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
974 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
975 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
976 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
977 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
978 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
979 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
980 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
981 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
982 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
983 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
984 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
985 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
986 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
987 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
988 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
989 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
990 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
991 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
992 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
993 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
994 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
995 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
996 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
997 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
998 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
999 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1000 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1001 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1002 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1003 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1004 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1005 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1006 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1007 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1008 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1009 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1010 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1011 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1012 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1013 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1014 // GFX1250 WMMA builtins
1015 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1016 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1017 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1018 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1019 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1020 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1021 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1022 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1023 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1024 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1025 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1026 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1027 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1028 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1029 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1030 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1031 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1032 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1033 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1034 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1035 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1036 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1037 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1038 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1039 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1040 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1041 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1042 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1043 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1044 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1045 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1046 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1047 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1048 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1049 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1050 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1051 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1052 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1053 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1054 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1055 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1056 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1057 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
1058
1059 // These operations perform a matrix multiplication and accumulation of
1060 // the form:
1061 // D = A * B + C
1062 // We need to specify one type for matrices AB and one for matrices CD.
1063 // Sparse matrix operations can have different types for A and B as well as
1064 // an additional type for sparsity index.
1065 // Destination type should be put before types used for source operands.
1066 SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
1067 // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
1068 // There is no need for the variable opsel argument, so always set it to
1069 // "false".
1070 bool AppendFalseForOpselArg = false;
1071 unsigned BuiltinWMMAOp;
1072 // Need return type when D and C are of different types.
1073 bool NeedReturnType = false;
1074
1075 switch (BuiltinID) {
1076 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
1077 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
1078 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
1079 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
1080 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1081 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
1082 break;
1083 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1084 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1085 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
1086 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
1087 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1088 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
1089 break;
1090 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
1091 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
1092 AppendFalseForOpselArg = true;
1093 [[fallthrough]];
1094 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1095 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
1096 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1097 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
1098 break;
1099 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
1100 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
1101 AppendFalseForOpselArg = true;
1102 [[fallthrough]];
1103 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1104 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
1105 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1106 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
1107 break;
1108 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1109 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
1110 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1111 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
1112 break;
1113 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1114 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
1115 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1116 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
1117 break;
1118 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1119 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1120 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1121 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
1122 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1123 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
1124 break;
1125 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1126 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1127 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1128 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
1129 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1130 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
1131 break;
1132 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1133 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
1134 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1135 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
1136 break;
1137 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1138 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
1139 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1140 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
1141 break;
1142 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1143 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
1144 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1145 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
1146 break;
1147 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1148 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
1149 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1150 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
1151 break;
1152 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1153 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
1154 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1155 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
1156 break;
1157 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1158 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
1159 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1160 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
1161 break;
1162 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1163 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
1164 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1165 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
1166 break;
1167 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1168 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
1169 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1170 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
1171 break;
1172 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1173 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1174 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1175 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
1176 break;
1177 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1178 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1179 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1180 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
1181 break;
1182 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1183 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1184 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1185 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
1186 break;
1187 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1188 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1189 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1190 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
1191 break;
1192 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1193 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1194 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1195 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
1196 break;
1197 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1198 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1199 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1200 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
1201 break;
1202 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1203 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1204 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1205 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
1206 break;
1207 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1208 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1209 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1210 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
1211 break;
1212 // GFX1250 WMMA builtins
1213 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1214 ArgsForMatchingMatrixTypes = {5, 1};
1215 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
1216 break;
1217 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1218 ArgsForMatchingMatrixTypes = {5, 1};
1219 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
1220 break;
1221 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1222 ArgsForMatchingMatrixTypes = {5, 1};
1223 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
1224 break;
1225 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1226 ArgsForMatchingMatrixTypes = {5, 1};
1227 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
1228 break;
1229 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1230 ArgsForMatchingMatrixTypes = {5, 1};
1231 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
1232 break;
1233 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1234 NeedReturnType = true;
1235 ArgsForMatchingMatrixTypes = {1, 5};
1236 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
1237 break;
1238 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1239 ArgsForMatchingMatrixTypes = {3, 0};
1240 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
1241 break;
1242 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1243 ArgsForMatchingMatrixTypes = {3, 0};
1244 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
1245 break;
1246 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1247 ArgsForMatchingMatrixTypes = {3, 0};
1248 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
1249 break;
1250 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1251 ArgsForMatchingMatrixTypes = {3, 0};
1252 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
1253 break;
1254 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1255 ArgsForMatchingMatrixTypes = {3, 0};
1256 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
1257 break;
1258 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1259 ArgsForMatchingMatrixTypes = {3, 0};
1260 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
1261 break;
1262 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1263 ArgsForMatchingMatrixTypes = {3, 0};
1264 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
1265 break;
1266 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1267 ArgsForMatchingMatrixTypes = {3, 0};
1268 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
1269 break;
1270 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1271 ArgsForMatchingMatrixTypes = {3, 0};
1272 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
1273 break;
1274 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1275 ArgsForMatchingMatrixTypes = {3, 0};
1276 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
1277 break;
1278 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1279 ArgsForMatchingMatrixTypes = {3, 0};
1280 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
1281 break;
1282 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1283 ArgsForMatchingMatrixTypes = {3, 0};
1284 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
1285 break;
1286 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1287 ArgsForMatchingMatrixTypes = {3, 0};
1288 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
1289 break;
1290 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1291 ArgsForMatchingMatrixTypes = {3, 0};
1292 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
1293 break;
1294 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1295 ArgsForMatchingMatrixTypes = {3, 0};
1296 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
1297 break;
1298 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1299 ArgsForMatchingMatrixTypes = {3, 0};
1300 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
1301 break;
1302 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1303 ArgsForMatchingMatrixTypes = {4, 1};
1304 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
1305 break;
1306 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1307 ArgsForMatchingMatrixTypes = {5, 1, 3};
1308 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
1309 break;
1310 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1311 ArgsForMatchingMatrixTypes = {5, 1, 3};
1312 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4;
1313 break;
1314 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1315 ArgsForMatchingMatrixTypes = {5, 1, 3};
1316 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4;
1317 break;
1318 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1319 ArgsForMatchingMatrixTypes = {3, 0, 1};
1320 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
1321 break;
1322 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1323 ArgsForMatchingMatrixTypes = {3, 0, 1};
1324 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
1325 break;
1326 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1327 ArgsForMatchingMatrixTypes = {3, 0, 1};
1328 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
1329 break;
1330 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1331 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1332 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
1333 break;
1334 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1335 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1336 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
1337 break;
1338 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1339 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1340 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
1341 break;
1342 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1343 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1344 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
1345 break;
1346 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1347 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1348 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
1349 break;
1350 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1351 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1352 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
1353 break;
1354 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1355 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1356 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
1357 break;
1358 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1359 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1360 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
1361 break;
1362 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1363 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1364 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
1365 break;
1366 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1367 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1368 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
1369 break;
1370 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1371 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1372 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
1373 break;
1374 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1375 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1376 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
1377 break;
1378 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1379 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1380 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
1381 break;
1382 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
1383 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1384 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
1385 break;
1386 }
1387
1388 SmallVector<Value *, 6> Args;
1389 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
1390 Args.push_back(EmitScalarExpr(E->getArg(i)));
1391 if (AppendFalseForOpselArg)
1392 Args.push_back(Builder.getFalse());
1393
1394 SmallVector<llvm::Type *, 6> ArgTypes;
1395 if (NeedReturnType)
1396 ArgTypes.push_back(ConvertType(E->getType()));
1397 for (auto ArgIdx : ArgsForMatchingMatrixTypes)
1398 ArgTypes.push_back(Args[ArgIdx]->getType());
1399
1400 Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
1401 return Builder.CreateCall(F, Args);
1402 }
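// Illustrative note, not upstream documentation: ArgsForMatchingMatrixTypes
// lists which call operands supply the overloaded (type-mangled) parameters of
// the chosen intrinsic. For the swmmac_* cases above, {2, 0, 1, 3} picks, in
// order, the C/D accumulator, the A matrix, the B matrix and the sparsity
// index, so CGM.getIntrinsic() materializes a declaration mangled on those
// four types, e.g. (assuming wave32 operand widths) something like
//   llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16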
1403 // amdgcn workgroup size
1404 case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
1405 return EmitAMDGPUWorkGroupSize(*this, 0);
1406 case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
1407 return EmitAMDGPUWorkGroupSize(*this, 1);
1408 case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
1409 return EmitAMDGPUWorkGroupSize(*this, 2);
1410
1411 // amdgcn grid size
1412 case AMDGPU::BI__builtin_amdgcn_grid_size_x:
1413 return EmitAMDGPUGridSize(*this, 0);
1414 case AMDGPU::BI__builtin_amdgcn_grid_size_y:
1415 return EmitAMDGPUGridSize(*this, 1);
1416 case AMDGPU::BI__builtin_amdgcn_grid_size_z:
1417 return EmitAMDGPUGridSize(*this, 2);
1418
1419 // r600 intrinsics
1420 case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
1421 case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
1422 return emitBuiltinWithOneOverloadedType<1>(*this, E,
1423 Intrinsic::r600_recipsqrt_ieee);
1424 case AMDGPU::BI__builtin_amdgcn_alignbit: {
1425 llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
1426 llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
1427 llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
1428 Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
1429 return Builder.CreateCall(F, { Src0, Src1, Src2 });
1430 }
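// Note (sketch): v_alignbit_b32 is a 32-bit funnel shift right, so the builtin
// maps directly onto llvm.fshr.i32: alignbit(a, b, c) selects 32 bits from the
// 64-bit concatenation {a, b} shifted right by c (mod 32), which is exactly
// fshr(a, b, c).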
1431 case AMDGPU::BI__builtin_amdgcn_fence: {
1432 ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
1433 EmitScalarExpr(E->getArg(1)), AO, SSID);
1434 FenceInst *Fence = Builder.CreateFence(AO, SSID);
1435 if (E->getNumArgs() > 2)
1436 AddAMDGPUFenceAddressSpaceMMRA(Fence, E);
1437 return Fence;
1438 }
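// Illustrative sketch (assumed source-level usage, not taken from this file):
//   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
// is expected to lower to
//   fence syncscope("workgroup") seq_cst
// with ProcessOrderScopeAMDGCN translating the C11 ordering and the scope
// string, and any additional address-space arguments recorded as MMRA metadata
// on the fence by AddAMDGPUFenceAddressSpaceMMRA.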
1439 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1440 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1441 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1442 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1443 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1444 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1445 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1446 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1447 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1448 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1449 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1450 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1451 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1452 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1453 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1454 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1455 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1456 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1457 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1458 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1459 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1460 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1461 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1462 llvm::AtomicRMWInst::BinOp BinOp;
1463 switch (BuiltinID) {
1464 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1465 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1466 BinOp = llvm::AtomicRMWInst::UIncWrap;
1467 break;
1468 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1469 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1470 BinOp = llvm::AtomicRMWInst::UDecWrap;
1471 break;
1472 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1473 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1474 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1475 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1476 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1477 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1478 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1479 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1480 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1481 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1482 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1483 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1484 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1485 BinOp = llvm::AtomicRMWInst::FAdd;
1486 break;
1487 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1488 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1489 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1490 BinOp = llvm::AtomicRMWInst::FMin;
1491 break;
1492 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1493 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1494 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1495 BinOp = llvm::AtomicRMWInst::FMax;
1496 break;
1497 }
1498
1499 Address Ptr = CheckAtomicAlignment(*this, E);
1500 Value *Val = EmitScalarExpr(E->getArg(1));
1501 llvm::Type *OrigTy = Val->getType();
1502 QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
1503
1504 bool Volatile;
1505
1506 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1507 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1508 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
1509 // __builtin_amdgcn_ds_faddf/fminf/fmaxf have an explicit volatile argument
1510 Volatile =
1511 cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
1512 } else {
1513 // Infer volatile from the passed type.
1514 Volatile =
1515 PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1516 }
1517
1518 if (E->getNumArgs() >= 4) {
1519 // Some of the builtins have explicit ordering and scope arguments.
1520 ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1521 EmitScalarExpr(E->getArg(3)), AO, SSID);
1522 } else {
1523 // Most of the builtins do not have syncscope/order arguments. For DS
1524 // atomics the scope doesn't really matter, as they implicitly operate at
1525 // workgroup scope.
1526 //
1527 // The global/flat cases need to use agent scope to consistently produce
1528 // the native instruction instead of a cmpxchg expansion.
1529 if (getTarget().getTriple().isSPIRV())
1530 SSID = getLLVMContext().getOrInsertSyncScopeID("device");
1531 else
1532 SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1533 AO = AtomicOrdering::Monotonic;
1534
1535 // The v2bf16 builtin uses i16 instead of a natural bfloat type.
1536 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
1537 BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
1538 BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1539 llvm::Type *V2BF16Ty = FixedVectorType::get(
1540 llvm::Type::getBFloatTy(Builder.getContext()), 2);
1541 Val = Builder.CreateBitCast(Val, V2BF16Ty);
1542 }
1543 }
1544
1545 llvm::AtomicRMWInst *RMW =
1546 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1547 if (Volatile)
1548 RMW->setVolatile(true);
1549
1550 unsigned AddrSpace = Ptr.getType()->getAddressSpace();
1551 if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
1552 // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
1553 // instruction for flat and global operations.
1554 llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
1555 RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
1556
1557 // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
1558 // instruction, but this only matters for float fadd.
1559 if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
1560 RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
1561 }
1562
1563 return Builder.CreateBitCast(RMW, OrigTy);
1564 }
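// Illustrative sketch of the expected output (assuming a global-pointer float
// fadd with no explicit ordering/scope arguments): the path above emits roughly
//   %old = atomicrmw fadd ptr addrspace(1) %p, float %v
//              syncscope("agent") monotonic,
//              !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
// which lets the backend select the native global atomic instead of a cmpxchg
// expansion.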
1565 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1566 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
1567 llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
1568 llvm::Type *ResultType = ConvertType(E->getType());
1569 // s_sendmsg_rtn is mangled using return type only.
1570 Function *F =
1571 CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
1572 return Builder.CreateCall(F, {Arg});
1573 }
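// Note (sketch): since the intrinsic is overloaded only on its result, the
// 32-bit builtin resolves to llvm.amdgcn.s.sendmsg.rtn.i32 and the 64-bit
// variant to llvm.amdgcn.s.sendmsg.rtn.i64; the message operand is forwarded
// unchanged.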
1574 case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
1575 case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
1576 // Because builtin types are limited, and the intrinsic uses a struct/pair
1577 // output, marshal the pair-of-i32 to <2 x i32>.
1578 Value *VDstOld = EmitScalarExpr(E->getArg(0));
1579 Value *VSrcOld = EmitScalarExpr(E->getArg(1));
1580 Value *FI = EmitScalarExpr(E->getArg(2));
1581 Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
1582 Function *F =
1583 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
1584 ? Intrinsic::amdgcn_permlane16_swap
1585 : Intrinsic::amdgcn_permlane32_swap);
1586 llvm::CallInst *Call =
1587 Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});
1588
1589 llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
1590 llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);
1591
1592 llvm::Type *ResultType = ConvertType(E->getType());
1593
1594 llvm::Value *Insert0 = Builder.CreateInsertElement(
1595 llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
1596 llvm::Value *AsVector =
1597 Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
1598 return AsVector;
1599 }
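// Illustrative IR shape for the marshalling above (a sketch): the intrinsic
// returns { i32, i32 }, which is repacked as
//   %p   = call { i32, i32 } @llvm.amdgcn.permlane16.swap(...)
//   %e0  = extractvalue { i32, i32 } %p, 0
//   %e1  = extractvalue { i32, i32 } %p, 1
//   %v0  = insertelement <2 x i32> poison, i32 %e0, i64 0
//   %res = insertelement <2 x i32> %v0, i32 %e1, i64 1
// so the builtin's two-element vector return type is satisfied.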
1600 case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
1601 case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
1602 return emitBuiltinWithOneOverloadedType<4>(*this, E,
1603 Intrinsic::amdgcn_bitop3);
1604 case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
1605 // TODO: LLVM has this overloaded to allow for fat pointers, but since
1606 // those haven't been plumbed through to Clang yet, default to creating the
1607 // resource type.
1608 SmallVector<Value *, 4> Args;
1609 for (unsigned I = 0; I < 4; ++I)
1610 Args.push_back(EmitScalarExpr(E->getArg(I)));
1611 llvm::PointerType *RetTy = llvm::PointerType::get(
1612 Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
1613 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
1614 {RetTy, Args[0]->getType()});
1615 return Builder.CreateCall(F, Args);
1616 }
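// Note (sketch, assuming the default resource lowering described in the TODO):
// the result is a ptr addrspace(8) buffer resource, so a call such as
//   __builtin_amdgcn_make_buffer_rsrc(base, stride, num_records, flags)
// becomes a call to llvm.amdgcn.make.buffer.rsrc mangled on the result type
// and on the type of the base pointer.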
1617 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
1618 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
1619 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
1620 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
1621 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
1622 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
1623 return emitBuiltinWithOneOverloadedType<5>(
1624 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
1625 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1626 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1627 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1628 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1629 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1630 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
1631 llvm::Type *RetTy = nullptr;
1632 switch (BuiltinID) {
1633 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1634 RetTy = Int8Ty;
1635 break;
1636 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1637 RetTy = Int16Ty;
1638 break;
1639 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1640 RetTy = Int32Ty;
1641 break;
1642 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1643 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
1644 break;
1645 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1646 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
1647 break;
1648 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
1649 RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
1650 break;
1651 }
1652 Function *F =
1653 CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
1654 return Builder.CreateCall(
1655 F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
1656 EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
1657 }
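// Illustrative mapping (sketch): RetTy also selects the intrinsic overload, so
// e.g. the _b96 builtin loads a <3 x i32> via llvm.amdgcn.raw.ptr.buffer.load
// mangled on that type, with the four forwarded operands being the buffer
// resource, the byte offset, the scalar offset and the aux/cache-policy bits.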
1658 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32:
1659 return emitBuiltinWithOneOverloadedType<5>(
1660 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_add);
1661 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32:
1662 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16:
1663 return emitBuiltinWithOneOverloadedType<5>(
1664 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd);
1665 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32:
1666 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f64:
1667 return emitBuiltinWithOneOverloadedType<5>(
1668 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin);
1669 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32:
1670 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64:
1671 return emitBuiltinWithOneOverloadedType<5>(
1672 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax);
1673 case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
1674 return emitBuiltinWithOneOverloadedType<2>(
1675 *this, E, Intrinsic::amdgcn_s_prefetch_data);
1676 case Builtin::BIlogbf:
1677 case Builtin::BI__builtin_logbf: {
1678 Value *Src0 = EmitScalarExpr(E->getArg(0));
1679 Function *FrExpFunc = CGM.getIntrinsic(
1680 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1681 CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1682 Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1683 Value *Add = Builder.CreateAdd(
1684 Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1685 Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
1686 Value *Fabs =
1687 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1688 Value *FCmpONE = Builder.CreateFCmpONE(
1689 Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
1690 Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1691 Value *FCmpOEQ =
1692 Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy()));
1693 Value *Sel2 = Builder.CreateSelect(
1694 FCmpOEQ,
1695 ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1);
1696 return Sel2;
1697 }
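// Worked example (sketch) of the expansion above: frexp(x) returns a mantissa m
// in [0.5, 1) and an exponent e with x = m * 2^e, so logb(x) = e - 1 for finite
// non-zero x; e.g. for x = 8.0f, frexp gives m = 0.5 and e = 4, hence
// logbf(8.0f) = 3.0. The two selects patch the IEEE special cases: |x| == inf
// yields +inf (via fabs) and x == 0 yields -inf.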
1698 case Builtin::BIlogb:
1699 case Builtin::BI__builtin_logb: {
1700 Value *Src0 = EmitScalarExpr(E->getArg(0));
1701 Function *FrExpFunc = CGM.getIntrinsic(
1702 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1703 CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
1704 Value *Exp = Builder.CreateExtractValue(FrExp, 1);
1705 Value *Add = Builder.CreateAdd(
1706 Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true);
1707 Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
1708 Value *Fabs =
1709 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1710 Value *FCmpONE = Builder.CreateFCmpONE(
1711 Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
1712 Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
1713 Value *FCmpOEQ =
1714 Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy()));
1715 Value *Sel2 = Builder.CreateSelect(
1716 FCmpOEQ,
1717 ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
1718 Sel1);
1719 return Sel2;
1720 }
1721 case Builtin::BIscalbnf:
1722 case Builtin::BI__builtin_scalbnf:
1723 case Builtin::BIscalbn:
1724 case Builtin::BI__builtin_scalbn:
1725 return emitBinaryExpMaybeConstrainedFPBuiltin(
1726 *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
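// Note (sketch): with the binary radix used here, scalbn(x, n) == ldexp(x, n)
// == x * 2^n, so the builtin is forwarded directly to llvm.ldexp (or to the
// constrained variant under strict floating-point semantics).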
1727 default:
1728 return nullptr;
1729 }
1730}
static Value * emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E, Intrinsic::ID IntrinsicID, Intrinsic::ID ConstrainedIntrinsicID)
Address CheckAtomicAlignment(CodeGenFunction &CGF, const CallExpr *E)
llvm::Value * emitBuiltinWithOneOverloadedType(clang::CodeGen::CodeGenFunction &CGF, const clang::CallExpr *E, unsigned IntrinsicID, llvm::StringRef Name="")
Definition CGBuiltin.h:63
static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID)
Definition AMDGPU.cpp:314
static StringRef mapScopeToSPIRV(StringRef AMDGCNScope)
Definition AMDGPU.cpp:195
static Value * EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E, llvm::Type *RegisterType, llvm::Type *ValueType, bool isExecHi)
Definition AMDGPU.cpp:166
static Value * emitFPIntBuiltin(CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID)
Definition AMDGPU.cpp:185
Enumerates target-specific builtins in their own namespaces within namespace clang.
QualType GetBuiltinType(unsigned ID, GetBuiltinTypeError &Error, unsigned *IntegerConstantArgs=nullptr) const
Return the type for the specified builtin.
unsigned getTargetAddressSpace(LangAS AS) const
@ GE_None
No error.
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition Expr.h:2877
Expr * getArg(unsigned Arg)
getArg - Return the specified argument.
Definition Expr.h:3081
unsigned getNumArgs() const
getNumArgs - Return the number of actual arguments to this call.
Definition Expr.h:3068
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition CharUnits.h:63
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition Address.h:128
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition Address.h:209
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition Address.h:204
Address CreateGEP(CodeGenFunction &CGF, Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition CGBuilder.h:296
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
Definition CGBuilder.h:112
llvm::LoadInst * CreateAlignedLoad(llvm::Type *Ty, llvm::Value *Addr, CharUnits Align, const llvm::Twine &Name="")
Definition CGBuilder.h:132
Address CreateAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition CGBuilder.h:193
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Value * EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx, const CallExpr *E)
Definition AMDGPU.cpp:274
llvm::Type * ConvertType(QualType T)
llvm::Value * EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E)
Definition AMDGPU.cpp:348
const TargetInfo & getTarget() const
void AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, const CallExpr *E)
Definition AMDGPU.cpp:291
Address EmitPointerWithAlignment(const Expr *Addr, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr, KnownNonNull_t IsKnownNonNull=NotKnownNonNull)
EmitPointerWithAlignment - Given an expression with a pointer type, emit the value and compute our be...
Definition CGExpr.cpp:1515
llvm::Value * EmitScalarExpr(const Expr *E, bool IgnoreResultAssign=false)
EmitScalarExpr - Emit the computation of the specified expression of LLVM scalar type,...
llvm::LLVMContext & getLLVMContext()
void ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope, llvm::AtomicOrdering &AO, llvm::SyncScope::ID &SSID)
Definition AMDGPU.cpp:209
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
ASTContext & getContext() const
llvm::Function * getIntrinsic(unsigned IID, ArrayRef< llvm::Type * > Tys={})
Expr * IgnoreParenCasts() LLVM_READONLY
Skip past any parentheses and casts which might surround this expression until reaching a fixed point...
Definition Expr.cpp:3090
std::optional< llvm::APSInt > getIntegerConstantExpr(const ASTContext &Ctx) const
isIntegerConstantExpr - Return the value if this expression is a valid integer constant expression.
Expr * IgnoreImpCasts() LLVM_READONLY
Skip past any implicit casts which might surround this expression until reaching a fixed point.
Definition Expr.cpp:3065
SourceLocation getExprLoc() const LLVM_READONLY
getExprLoc - Return the preferred location for the arrow when diagnosing a problem with a generic exp...
Definition Expr.cpp:273
QualType getType() const
Definition Expr.h:144
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition TypeBase.h:3328
A (possibly-)qualified type.
Definition TypeBase.h:937
bool isVolatileQualified() const
Determine whether this type is volatile-qualified.
Definition TypeBase.h:8371
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition TargetInfo.h:323
unsigned getMaxOpenCLWorkGroupSize() const
Definition TargetInfo.h:870
llvm::CodeObjectVersionKind CodeObjectVersion
Code object version for AMDGPU.
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64