clang 20.0.0git
amdgpuintrin.h
Go to the documentation of this file.
1//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef __AMDGPUINTRIN_H
10#define __AMDGPUINTRIN_H
11
12#ifndef __AMDGPU__
13#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
14#endif
15
16#include <stdint.h>
17
18#if !defined(__cplusplus)
19_Pragma("push_macro(\"bool\")");
20#define bool _Bool
21#endif
22
23_Pragma("omp begin declare target device_type(nohost)");
24_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
25
26// Type aliases to the address spaces used by the AMDGPU backend.
27#define __gpu_private __attribute__((address_space(5)))
28#define __gpu_constant __attribute__((address_space(4)))
29#define __gpu_local __attribute__((address_space(3)))
30#define __gpu_global __attribute__((address_space(1)))
31#define __gpu_generic __attribute__((address_space(0)))
32
33// Attribute to declare a function as a kernel.
34#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
35
36// Returns the number of workgroups in the 'x' dimension of the grid.
37_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
38 return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
39}
40
41// Returns the number of workgroups in the 'y' dimension of the grid.
42_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
43 return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
44}
45
46// Returns the number of workgroups in the 'z' dimension of the grid.
47_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
48 return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
49}
50
51// Returns the 'x' dimension of the current AMD workgroup's id.
52_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
53 return __builtin_amdgcn_workgroup_id_x();
54}
55
56// Returns the 'y' dimension of the current AMD workgroup's id.
57_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
58 return __builtin_amdgcn_workgroup_id_y();
59}
60
61// Returns the 'z' dimension of the current AMD workgroup's id.
62_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
63 return __builtin_amdgcn_workgroup_id_z();
64}
65
66// Returns the number of workitems in the 'x' dimension.
67_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
68 return __builtin_amdgcn_workgroup_size_x();
69}
70
71// Returns the number of workitems in the 'y' dimension.
72_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
73 return __builtin_amdgcn_workgroup_size_y();
74}
75
76// Returns the number of workitems in the 'z' dimension.
77_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
78 return __builtin_amdgcn_workgroup_size_z();
79}
80
81// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
82_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
83 return __builtin_amdgcn_workitem_id_x();
84}
85
86// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
87_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
88 return __builtin_amdgcn_workitem_id_y();
89}
90
91// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
92_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
93 return __builtin_amdgcn_workitem_id_z();
94}
95
96// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
97// and compilation options.
98_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
99 return __builtin_amdgcn_wavefrontsize();
100}
101
102// Returns the id of the thread inside of an AMD wavefront executing together.
103_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
104 return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
105}
106
107// Returns the bit-mask of active threads in the current wavefront.
108_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
109 return __builtin_amdgcn_read_exec();
110}
111
112// Copies the value from the first active thread in the wavefront to the rest.
113_DEFAULT_FN_ATTRS static __inline__ uint32_t
114__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
115 return __builtin_amdgcn_readfirstlane(__x);
116}
117
118// Copies the value from the first active thread in the wavefront to the rest.
120__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121 uint32_t __hi = (uint32_t)(__x >> 32ull);
122 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
123 return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
124 ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
125}
126
127// Returns a bitmask of threads in the current lane for which \p x is true.
128_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
129 bool __x) {
130 // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
131 // the active threads
132 return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
133}
134
135// Waits for all the threads in the block to converge and issues a fence.
136_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
137 __builtin_amdgcn_s_barrier();
138 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
139}
140
141// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
142_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
143 __builtin_amdgcn_wave_barrier();
144}
145
146// Shuffles the the lanes inside the wavefront according to the given index.
147_DEFAULT_FN_ATTRS static __inline__ uint32_t
148__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
149 return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
150}
151
152// Shuffles the the lanes inside the wavefront according to the given index.
153_DEFAULT_FN_ATTRS static __inline__ uint64_t
154__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
155 uint32_t __hi = (uint32_t)(__x >> 32ull);
156 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
157 return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
158 ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
159}
160
161// Returns true if the flat pointer points to CUDA 'shared' memory.
162_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
163 return __builtin_amdgcn_is_shared((void __attribute__((address_space(0))) *)((
164 void [[clang::opencl_generic]] *)ptr));
165}
166
167// Returns true if the flat pointer points to CUDA 'local' memory.
168_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
169 return __builtin_amdgcn_is_private((void __attribute__((
170 address_space(0))) *)((void [[clang::opencl_generic]] *)ptr));
171}
172
173// Terminates execution of the associated wavefront.
174_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
175 __builtin_amdgcn_endpgm();
176}
177
178// Suspend the thread briefly to assist the scheduler during busy loops.
179_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
180 __builtin_amdgcn_s_sleep(2);
181}
182
183_Pragma("omp end declare variant");
184_Pragma("omp end declare target");
185
186#if !defined(__cplusplus)
187_Pragma("pop_macro(\"bool\")");
188#endif
189
190#endif // __AMDGPUINTRIN_H
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static _DEFAULT_FN_ATTRS __inline__ void __gpu_thread_suspend(void)
Definition: amdgpuintrin.h:179
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
Definition: amdgpuintrin.h:82
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
Definition: amdgpuintrin.h:103
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
Definition: amdgpuintrin.h:77
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_lane_mask(void)
Definition: amdgpuintrin.h:108
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x)
Definition: amdgpuintrin.h:148
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
Definition: amdgpuintrin.h:114
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
Definition: amdgpuintrin.h:57
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
Definition: amdgpuintrin.h:92
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_lane(uint64_t __lane_mask)
Definition: amdgpuintrin.h:142
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_private(void *ptr)
Definition: amdgpuintrin.h:168
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_local(void *ptr)
Definition: amdgpuintrin.h:162
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
Definition: amdgpuintrin.h:37
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
Definition: amdgpuintrin.h:62
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
Definition: amdgpuintrin.h:72
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x)
Definition: amdgpuintrin.h:154
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
Definition: amdgpuintrin.h:67
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
Definition: amdgpuintrin.h:98
_DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
Definition: amdgpuintrin.h:120
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
Definition: amdgpuintrin.h:87
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
Definition: amdgpuintrin.h:47
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
Definition: amdgpuintrin.h:52
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
Definition: amdgpuintrin.h:128
static _DEFAULT_FN_ATTRS __inline__ void __gpu_exit(void)
Definition: amdgpuintrin.h:174
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_threads(void)
Definition: amdgpuintrin.h:136
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
Definition: amdgpuintrin.h:42
_Pragma("push_macro(\"bool\")")
#define _DEFAULT_FN_ATTRS
Definition: enqcmdintrin.h:18
unsigned long uint64_t
unsigned int uint32_t
#define noreturn
Definition: stdnoreturn.h:17