//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Provides wrappers around the clang builtins for accessing GPU hardware
// features. The interface is intended to be portable between architectures, but
// some targets may provide different implementations. This header can be
// included for all the common GPU programming languages, namely OpenMP, HIP,
// CUDA, and OpenCL.
//
//===----------------------------------------------------------------------===//

#ifndef __GPUINTRIN_H
#define __GPUINTRIN_H

#if !defined(_DEFAULT_FN_ATTRS)
#if defined(__HIP__) || defined(__CUDA__)
#define _DEFAULT_FN_ATTRS __attribute__((device))
#else
#define _DEFAULT_FN_ATTRS
#endif
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

// Forward declare a few functions for the implementation header.

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
#elif defined(__SPIRV__)
#include <spirvintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))

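// As an illustrative sketch only (the kernel name and signature below are
// placeholders, not part of this header), a function becomes a kernel entry
// point by carrying the attribute:
//
//   void __gpu_kernel example_kernel(int *__out) { *__out = 1; }
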
#define __GPU_X_DIM 0
#define __GPU_Y_DIM 1
#define __GPU_Z_DIM 2

// Returns the number of blocks in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_num_blocks_x();
  case 1:
    return __gpu_num_blocks_y();
  case 2:
    return __gpu_num_blocks_z();
  default:
    return 1;
  }
}

// Returns the block id in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_block_id_x();
  case 1:
    return __gpu_block_id_y();
  case 2:
    return __gpu_block_id_z();
  default:
    return 0;
  }
}

// Returns the number of threads in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_num_threads_x();
  case 1:
    return __gpu_num_threads_y();
  case 2:
    return __gpu_num_threads_z();
  default:
    return 1;
  }
}

// Returns the thread id in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_thread_id_x();
  case 1:
    return __gpu_thread_id_y();
  case 2:
    return __gpu_thread_id_z();
  default:
    return 0;
  }
}

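// As an illustrative sketch (the variable name is a placeholder), the helpers
// above compose into a flattened global thread id for a 1-D launch:
//
//   uint32_t __gid = __gpu_block_id(__GPU_X_DIM) *
//                        __gpu_num_threads(__GPU_X_DIM) +
//                    __gpu_thread_id(__GPU_X_DIM);
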
// Returns the id of the first active lane in the given lane mask.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_first_lane_id(uint64_t __lane_mask) {
  return __builtin_ffsll(__lane_mask) - 1;
}

// Returns true only for the first active thread in the lane mask.
_DEFAULT_FN_ATTRS static __inline__ bool
__gpu_is_first_in_lane(uint64_t __lane_mask) {
  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
}

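// This is commonly used to elect a single lane for a per-wavefront side
// effect, e.g. one atomic update. An illustrative sketch; __counter is a
// placeholder, not part of this interface:
//
//   if (__gpu_is_first_in_lane(__gpu_lane_mask()))
//     __atomic_fetch_add(__counter, 1, __ATOMIC_RELAXED);
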
// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
          0xFFFFFFFFull);
}

// Gets the first floating point value from the active lanes.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
  return __builtin_bit_cast(
      float, __gpu_read_first_lane_u32(__lane_mask,
                                       __builtin_bit_cast(uint32_t, __x)));
}

// Gets the first floating point value from the active lanes.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
  return __builtin_bit_cast(
      double, __gpu_read_first_lane_u64(__lane_mask,
                                        __builtin_bit_cast(uint64_t, __x)));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
                      uint32_t __width) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  uint32_t __mask = (uint32_t)__lane_mask;
  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
          << 32ull) |
         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

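// A shuffle whose index is the first active lane acts as a broadcast. An
// illustrative sketch; __mask, __v, and __x are placeholders:
//
//   uint64_t __mask = __gpu_lane_mask();
//   uint64_t __v = __gpu_shuffle_idx_u64(__mask, __gpu_first_lane_id(__mask),
//                                        __x, __gpu_num_lanes());
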
// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
                                   __builtin_bit_cast(uint32_t, __x), __width));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      double,
      __gpu_shuffle_idx_u64(__lane_mask, __idx,
                            __builtin_bit_cast(uint64_t, __x), __width));
}

// Gets the accumulated scan (inclusive prefix sum) of the threads in the warp
// or wavefront.
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix(       \
      uint64_t __lane_mask, __type __x) {                                      \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
    bool __divergent = __gpu_read_first_lane_u64(__lane_mask,                  \
                                                 __first & (__first + 1));     \
    if (__divergent) {                                                         \
      __type __accum = 0;                                                      \
      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {      \
        uint32_t __index = __builtin_ctzll(__mask);                            \
        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
                                                    __gpu_num_lanes());        \
        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;              \
        __accum += __tmp;                                                      \
      }                                                                        \
    } else {                                                                   \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
        uint32_t __index = __gpu_lane_id() - __step;                           \
        __bitmask_type __bitmask = __gpu_lane_id() >= __step;                  \
        __x += __builtin_bit_cast(                                             \
            __type,                                                            \
            -__bitmask & __builtin_bit_cast(__bitmask_type,                    \
                                            __gpu_shuffle_idx_##__suffix(      \
                                                __lane_mask, __index, __x,     \
                                                __gpu_num_lanes())));          \
      }                                                                        \
    }                                                                          \
    return __x;                                                                \
  }
__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
#undef __DO_LANE_SCAN

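// For example (illustrative only; __prefix is a placeholder), if four
// contiguous active lanes each hold the value 1, the inclusive scan returns
// 1, 2, 3, and 4 on lanes 0 through 3 respectively:
//
//   uint32_t __prefix = __gpu_lane_scan_u32(__gpu_lane_mask(), 1);
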
// Gets the sum of all lanes inside the warp or wavefront.
#define __DO_LANE_SUM(__type, __suffix)                                        \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
      uint64_t __lane_mask, __type __x) {                                      \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
    bool __divergent = __gpu_read_first_lane_u64(__lane_mask,                  \
                                                 __first & (__first + 1));     \
    if (__divergent) {                                                         \
      return __gpu_shuffle_idx_##__suffix(                                     \
          __lane_mask, 63 - __builtin_clzll(__lane_mask),                      \
          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());    \
    } else {                                                                   \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
        uint32_t __index = __step + __gpu_lane_id();                           \
        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,         \
                                            __gpu_num_lanes());                \
      }                                                                        \
      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);               \
    }                                                                          \
  }
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
#undef __DO_LANE_SUM

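// For example (illustrative only; __partial and __total are placeholders),
// summing one value per active lane so every lane receives the same total:
//
//   float __total = __gpu_lane_sum_f32(__gpu_lane_mask(), __partial);
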
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint32_t __first = __gpu_read_first_lane_u32(__active_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

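// For example (illustrative only; __peers is a placeholder), if lanes 0 and 2
// hold __x == 7 while lanes 1 and 3 hold __x == 9, lanes 0 and 2 receive the
// mask 0b0101 and lanes 1 and 3 receive 0b1010:
//
//   uint64_t __peers = __gpu_match_any_u32_impl(__gpu_lane_mask(), __x);
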
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint64_t __first = __gpu_read_first_lane_u64(__active_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

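// For example (illustrative only; __uniform is a placeholder), the result is
// the full lane mask when all active lanes agree on __x and zero otherwise,
// so it can serve as a uniformity test:
//
//   bool __uniform = __gpu_match_all_u32_impl(__gpu_lane_mask(), __x) != 0;
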
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif

#undef _DEFAULT_FN_ATTRS

#endif // __GPUINTRIN_H