clang 23.0.0git
gpuintrin.h
Go to the documentation of this file.
1//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Provides wrappers around the clang builtins for accessing GPU hardware
10// features. The interface is intended to be portable between architectures, but
11// some targets may provide different implementations. This header can be
12// included for all the common GPU programming languages, namely OpenMP, HIP,
13// CUDA, and OpenCL.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef __GPUINTRIN_H
18#define __GPUINTRIN_H
19
20#if !defined(_DEFAULT_FN_ATTRS)
21#if defined(__HIP__) || defined(__CUDA__)
22#define _DEFAULT_FN_ATTRS __attribute__((device))
23#else
24#define _DEFAULT_FN_ATTRS
25#endif
26#endif
27
28#include <stdint.h>
29
30#if !defined(__cplusplus)
31_Pragma("push_macro(\"bool\")");
32#define bool _Bool
33#endif
34
35_Pragma("omp begin declare target device_type(nohost)");
36_Pragma("omp begin declare variant match(device = {kind(gpu)})");
37
// Forward declare a few functions for the implementation header.
//
// NOTE(review): the target-specific headers included further down appear to
// rely on these generic match helpers, so they are declared here before those
// headers are pulled in — confirm against nvptxintrin.h / amdgpuintrin.h.

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
55
56_Pragma("omp end declare variant");
57_Pragma("omp end declare target");
58
59#if defined(__NVPTX__)
60#include <nvptxintrin.h>
61#elif defined(__AMDGPU__)
62#include <amdgpuintrin.h>
63#elif defined(__SPIRV__)
64#include <spirvintrin.h>
65#elif !defined(_OPENMP)
66#error "This header is only meant to be used on GPU architectures."
67#endif
68
69_Pragma("omp begin declare target device_type(nohost)");
70_Pragma("omp begin declare variant match(device = {kind(gpu)})");
71
72// Attribute to declare a function as a kernel.
73#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
74
75#define __GPU_X_DIM 0
76#define __GPU_Y_DIM 1
77#define __GPU_Z_DIM 2
78
79// Returns the number of blocks in the requested dimension.
80_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
81 switch (__dim) {
82 case 0:
83 return __gpu_num_blocks_x();
84 case 1:
85 return __gpu_num_blocks_y();
86 case 2:
87 return __gpu_num_blocks_z();
88 default:
89 return 1;
90 }
91}
92
93// Returns the number of block id in the requested dimension.
94_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
95 switch (__dim) {
96 case 0:
97 return __gpu_block_id_x();
98 case 1:
99 return __gpu_block_id_y();
100 case 2:
101 return __gpu_block_id_z();
102 default:
103 return 0;
104 }
105}
106
107// Returns the number of threads in the requested dimension.
108_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
109 switch (__dim) {
110 case 0:
111 return __gpu_num_threads_x();
112 case 1:
113 return __gpu_num_threads_y();
114 case 2:
115 return __gpu_num_threads_z();
116 default:
117 return 1;
118 }
119}
120
121// Returns the thread id in the requested dimension.
122_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
123 switch (__dim) {
124 case 0:
125 return __gpu_thread_id_x();
126 case 1:
127 return __gpu_thread_id_y();
128 case 2:
129 return __gpu_thread_id_z();
130 default:
131 return 0;
132 }
133}
134
135// Get the first active thread inside the lane.
136_DEFAULT_FN_ATTRS static __inline__ uint64_t
137__gpu_first_lane_id(uint64_t __lane_mask) {
138 return __builtin_ffsll(__lane_mask) - 1;
139}
140
141// Conditional that is only true for a single thread in a lane.
142_DEFAULT_FN_ATTRS static __inline__ bool
143__gpu_is_first_in_lane(uint64_t __lane_mask) {
144 return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
145}
146
147// Copies the value from the first active thread to the rest.
148_DEFAULT_FN_ATTRS static __inline__ uint64_t
149__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
150 uint32_t __hi = (uint32_t)(__x >> 32ull);
151 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
152 return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
153 ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
154 0xFFFFFFFFull);
155}
156
157// Gets the first floating point value from the active lanes.
158_DEFAULT_FN_ATTRS static __inline__ float
159__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
160 return __builtin_bit_cast(
161 float, __gpu_read_first_lane_u32(__lane_mask,
162 __builtin_bit_cast(uint32_t, __x)));
163}
164
165// Gets the first floating point value from the active lanes.
166_DEFAULT_FN_ATTRS static __inline__ double
167__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
168 return __builtin_bit_cast(
169 double, __gpu_read_first_lane_u64(__lane_mask,
170 __builtin_bit_cast(uint64_t, __x)));
171}
172
173// Shuffles the the lanes according to the given index.
174_DEFAULT_FN_ATTRS static __inline__ uint64_t
175__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
176 uint32_t __width) {
177 uint32_t __hi = (uint32_t)(__x >> 32ull);
178 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
179 uint32_t __mask = (uint32_t)__lane_mask;
180 return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
181 << 32ull) |
182 ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
183}
184
185// Shuffles the the lanes according to the given index.
186_DEFAULT_FN_ATTRS static __inline__ float
187__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
188 uint32_t __width) {
189 return __builtin_bit_cast(
190 float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
191 __builtin_bit_cast(uint32_t, __x), __width));
192}
193
194// Shuffles the the lanes according to the given index.
195_DEFAULT_FN_ATTRS static __inline__ double
196__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
197 uint32_t __width) {
198 return __builtin_bit_cast(
199 double,
200 __gpu_shuffle_idx_u64(__lane_mask, __idx,
201 __builtin_bit_cast(uint64_t, __x), __width));
202}
203
// Implements scan and reduction operations across a GPU warp or wavefront.
//
// Both scans work by iterating log2(N) steps. The bitmask tracks the currently
// unprocessed lanes, above or below the current lane in the case of a suffix or
// prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
// clear the bits that this operation handled.
//
// Each expansion produces three functions for the given (__op, __identity):
//   __gpu_suffix_scan_<prefix>_<suffix>: combines each lane with the lanes
//     above it in the mask (inclusive).
//   __gpu_prefix_scan_<prefix>_<suffix>: combines each lane with the lanes
//     below it in the mask (inclusive).
//   __gpu_lane_<prefix>_<suffix>: full reduction; the suffix scan leaves the
//     total in the first active lane, which is then broadcast to all lanes.
// When a lane has no unprocessed neighbor, __identity is combined instead so
// the step is a no-op for that lane.
#define __DO_LANE_OP(__type, __op, __identity, __prefix, __suffix)            \
  _DEFAULT_FN_ATTRS static __inline__ __type                                  \
  __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,             \
                                            __type __x) {                     \
    uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id());              \
    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {      \
      uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id();   \
      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
                                                     __gpu_num_lanes());      \
      __x = __x __op(__above ? __result : (__type)__identity);                \
      for (uint32_t __i = 0; __i < __step; ++__i)                             \
        __above &= __above - 1;                                               \
    }                                                                         \
    return __x;                                                               \
  }                                                                           \
                                                                              \
  _DEFAULT_FN_ATTRS static __inline__ __type                                  \
  __gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,             \
                                            __type __x) {                     \
    uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1);         \
    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {      \
      uint32_t __src =                                                        \
          __below ? (63 - __builtin_clzg(__below)) : __gpu_lane_id();         \
      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
                                                     __gpu_num_lanes());      \
      __x = __x __op(__below ? __result : (__type)__identity);                \
      for (uint32_t __i = 0; __i < __step; ++__i)                             \
        __below ^= (1ull << (63 - __builtin_clzg(__below, 0))) & __below;     \
    }                                                                         \
    return __x;                                                               \
  }                                                                           \
                                                                              \
  _DEFAULT_FN_ATTRS static __inline__ __type                                  \
  __gpu_lane_##__prefix##_##__suffix(uint64_t __lane_mask, __type __x) {      \
    return __gpu_read_first_lane_##__suffix(                                  \
        __lane_mask,                                                          \
        __gpu_suffix_scan_##__prefix##_##__suffix(__lane_mask, __x));         \
  }
// Instantiate '+' scans and reductions for the four basic element types.
__DO_LANE_OP(uint32_t, +, 0, sum, u32);
__DO_LANE_OP(uint64_t, +, 0, sum, u64);
__DO_LANE_OP(float, +, 0, sum, f32);
__DO_LANE_OP(double, +, 0, sum, f64);
#undef __DO_LANE_OP
253
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  // Iteratively partition the active lanes into groups of equal values. Each
  // round the lowest still-searching lane broadcasts its value; every lane
  // holding that value records the ballot of its group and retires. The loop
  // condition re-ballots on !__done so it runs until every lane has a group.
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      // Value held by the lowest lane that has not yet found its group.
      uint32_t __first = __gpu_shuffle_idx_u32(
          __active_mask, __builtin_ctzg(__active_mask), __x, __gpu_num_lanes());
      uint64_t __ballot = __gpu_ballot(__active_mask, __first == __x);
      if (__first == __x) {
        __match_mask = __ballot;
        __done = 1;
      }
    }
  }
  return __match_mask;
}
274
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  // Iteratively partition the active lanes into groups of equal values. Each
  // round the lowest still-searching lane broadcasts its value; every lane
  // holding that value records the ballot of its group and retires. The loop
  // condition re-ballots on !__done so it runs until every lane has a group.
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      // Value held by the lowest lane that has not yet found its group.
      uint64_t __first = __gpu_shuffle_idx_u64(
          __active_mask, __builtin_ctzg(__active_mask), __x, __gpu_num_lanes());
      uint64_t __ballot = __gpu_ballot(__active_mask, __first == __x);
      if (__first == __x) {
        __match_mask = __ballot;
        __done = 1;
      }
    }
  }
  return __match_mask;
}
295
296// Returns the current lane mask if every lane contains __x.
297_DEFAULT_FN_ATTRS static __inline__ uint64_t
298__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
299 uint32_t __first = __gpu_shuffle_idx_u32(
300 __lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
301 uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
302 return __ballot == __lane_mask ? __lane_mask : 0ull;
303}
304
305// Returns the current lane mask if every lane contains __x.
306_DEFAULT_FN_ATTRS static __inline__ uint64_t
307__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
308 uint64_t __first = __gpu_shuffle_idx_u64(
309 __lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
310 uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
311 return __ballot == __lane_mask ? __lane_mask : 0ull;
312}
313
314_Pragma("omp end declare variant");
315_Pragma("omp end declare target");
316
317#if !defined(__cplusplus)
318_Pragma("pop_macro(\"bool\")");
319#endif
320
321#undef _DEFAULT_FN_ATTRS
322
323#endif // __GPUINTRIN_H
__DEVICE__ unsigned int __ballot(int __a)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
#define _DEFAULT_FN_ATTRS
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id(int __dim)
Definition gpuintrin.h:94
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:149
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x)
Definition gpuintrin.h:256
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, uint32_t __width)
Definition gpuintrin.h:175
static _DEFAULT_FN_ATTRS __inline__ double __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x)
Definition gpuintrin.h:167
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:307
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x)
Definition gpuintrin.h:298
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_first_in_lane(uint64_t __lane_mask)
Definition gpuintrin.h:143
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads(int __dim)
Definition gpuintrin.h:108
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id(int __dim)
Definition gpuintrin.h:122
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_first_lane_id(uint64_t __lane_mask)
Definition gpuintrin.h:137
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:277
static _DEFAULT_FN_ATTRS __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x, uint32_t __width)
Definition gpuintrin.h:187
#define __DO_LANE_OP(__type, __op, __identity, __prefix, __suffix)
Definition gpuintrin.h:210
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks(int __dim)
Definition gpuintrin.h:80
static _DEFAULT_FN_ATTRS __inline__ float __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x)
Definition gpuintrin.h:159
static _DEFAULT_FN_ATTRS __inline__ double __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x, uint32_t __width)
Definition gpuintrin.h:196
_Pragma("push_macro(\"bool\")")