clang 23.0.0git
gpuintrin.h
Go to the documentation of this file.
1//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Provides wrappers around the clang builtins for accessing GPU hardware
10// features. The interface is intended to be portable between architectures, but
11// some targets may provide different implementations. This header can be
12// included for all the common GPU programming languages, namely OpenMP, HIP,
13// CUDA, and OpenCL.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef __GPUINTRIN_H
18#define __GPUINTRIN_H
19
20#if !defined(_DEFAULT_FN_ATTRS)
21#if defined(__HIP__) || defined(__CUDA__)
22#define _DEFAULT_FN_ATTRS __attribute__((device))
23#else
24#define _DEFAULT_FN_ATTRS
25#endif
26#endif
27
28#include <stdint.h>
29
30#if !defined(__cplusplus)
31_Pragma("push_macro(\"bool\")");
32#define bool _Bool
33#endif
34
35#if defined(__NVPTX__)
36#include <nvptxintrin.h>
37#elif defined(__AMDGPU__)
38#include <amdgpuintrin.h>
39#elif defined(__SPIRV__)
40#include <spirvintrin.h>
41#elif !defined(_OPENMP)
42#error "This header is only meant to be used on GPU architectures."
43#endif
44
45_Pragma("omp begin declare target device_type(nohost)");
46_Pragma("omp begin declare variant match(device = {kind(gpu)})");
47
48// Attribute to declare a function as a kernel.
49#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
50
51#define __GPU_X_DIM 0
52#define __GPU_Y_DIM 1
53#define __GPU_Z_DIM 2
54
55// Returns the number of blocks in the requested dimension.
56_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
57 switch (__dim) {
58 case 0:
59 return __gpu_num_blocks_x();
60 case 1:
61 return __gpu_num_blocks_y();
62 case 2:
63 return __gpu_num_blocks_z();
64 default:
65 return 1;
66 }
67}
68
69// Returns the number of block id in the requested dimension.
70_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
71 switch (__dim) {
72 case 0:
73 return __gpu_block_id_x();
74 case 1:
75 return __gpu_block_id_y();
76 case 2:
77 return __gpu_block_id_z();
78 default:
79 return 0;
80 }
81}
82
83// Returns the number of threads in the requested dimension.
84_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
85 switch (__dim) {
86 case 0:
87 return __gpu_num_threads_x();
88 case 1:
89 return __gpu_num_threads_y();
90 case 2:
91 return __gpu_num_threads_z();
92 default:
93 return 1;
94 }
95}
96
97// Returns the thread id in the requested dimension.
98_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
99 switch (__dim) {
100 case 0:
101 return __gpu_thread_id_x();
102 case 1:
103 return __gpu_thread_id_y();
104 case 2:
105 return __gpu_thread_id_z();
106 default:
107 return 0;
108 }
109}
110
111// Get the first active thread inside the lane.
112_DEFAULT_FN_ATTRS static __inline__ uint64_t
113__gpu_first_lane_id(uint64_t __lane_mask) {
114 return __builtin_ffsll(__lane_mask) - 1;
115}
116
117// Conditional that is only true for a single thread in a lane.
118_DEFAULT_FN_ATTRS static __inline__ bool
119__gpu_is_first_in_lane(uint64_t __lane_mask) {
120 return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
121}
122
123// Copies the value from the first active thread to the rest.
124_DEFAULT_FN_ATTRS static __inline__ uint64_t
125__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
126 uint32_t __hi = (uint32_t)(__x >> 32);
127 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
128 return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32) |
129 ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 0xFFFFFFFF);
130}
131
132// Gets the first floating point value from the active lanes.
133_DEFAULT_FN_ATTRS static __inline__ float
134__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
135 return __builtin_bit_cast(
136 float, __gpu_read_first_lane_u32(__lane_mask,
137 __builtin_bit_cast(uint32_t, __x)));
138}
139
140// Gets the first floating point value from the active lanes.
141_DEFAULT_FN_ATTRS static __inline__ double
142__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
143 return __builtin_bit_cast(
144 double, __gpu_read_first_lane_u64(__lane_mask,
145 __builtin_bit_cast(uint64_t, __x)));
146}
147
148// Shuffles the the lanes according to the given index.
149_DEFAULT_FN_ATTRS static __inline__ uint64_t
150__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
151 uint32_t __width) {
152 uint32_t __hi = (uint32_t)(__x >> 32);
153 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
154 uint32_t __mask = (uint32_t)__lane_mask;
155 return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) << 32) |
156 ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
157}
158
159// Shuffles the the lanes according to the given index.
160_DEFAULT_FN_ATTRS static __inline__ float
161__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
162 uint32_t __width) {
163 return __builtin_bit_cast(
164 float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
165 __builtin_bit_cast(uint32_t, __x), __width));
166}
167
168// Shuffles the the lanes according to the given index.
169_DEFAULT_FN_ATTRS static __inline__ double
170__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
171 uint32_t __width) {
172 return __builtin_bit_cast(
173 double,
174 __gpu_shuffle_idx_u64(__lane_mask, __idx,
175 __builtin_bit_cast(uint64_t, __x), __width));
176}
177
// Implements scan and reduction operations across a GPU warp or wavefront.
//
// Both scans work by iterating log2(N) steps. The bitmask tracks the currently
// unprocessed lanes, above or below the current lane in the case of a suffix or
// prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
// clear the bits that this operation handled.
//
// Instantiating __DO_LANE_OPS(type, op, identity, name, suffix) defines three
// functions:
//   __gpu_suffix_scan_<name>_<suffix>  - inclusive scan toward higher lanes
//   __gpu_prefix_scan_<name>_<suffix>  - inclusive scan toward lower lanes
//   __gpu_lane_<name>_<suffix>         - reduction: broadcasts the first
//                                        lane's suffix-scan result to all
//                                        participating lanes
// Inactive-neighbor reads are substituted with the supplied identity value so
// the operation is a no-op for them.
#define __DO_LANE_OPS(__type, __op, __identity, __prefix, __suffix)           \
  _DEFAULT_FN_ATTRS static __inline__ __type                                  \
      __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,         \
                                                __type __x) {                 \
    uint64_t __above = __lane_mask & -(UINT64_C(2) << __gpu_lane_id());       \
    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {      \
      uint32_t __src = __builtin_ctzg(__above, (int)sizeof(__above) * 8);     \
      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
                                                     __gpu_num_lanes());      \
      __x = __op(__x, __above ? __result : (__type)__identity);               \
      for (uint32_t __i = 0; __i < __step; ++__i)                             \
        __above &= __above - 1;                                               \
    }                                                                         \
    return __x;                                                               \
  }                                                                           \
                                                                              \
  _DEFAULT_FN_ATTRS static __inline__ __type                                  \
      __gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,         \
                                                __type __x) {                 \
    uint64_t __below = __lane_mask & ((UINT64_C(1) << __gpu_lane_id()) - 1);  \
    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {      \
      uint32_t __src = 63 - __builtin_clzg(__below, (int)sizeof(__below) * 8);\
      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
                                                     __gpu_num_lanes());      \
      __x = __op(__x, __below ? __result : (__type)__identity);               \
      for (uint32_t __i = 0; __i < __step; ++__i)                             \
        __below ^=                                                            \
            (UINT64_C(1) << (63 - __builtin_clzg(__below, 0))) & __below;     \
    }                                                                         \
    return __x;                                                               \
  }                                                                           \
                                                                              \
  _DEFAULT_FN_ATTRS static __inline__ __type                                  \
      __gpu_lane_##__prefix##_##__suffix(uint64_t __lane_mask, __type __x) {  \
    return __gpu_read_first_lane_##__suffix(                                  \
        __lane_mask,                                                          \
        __gpu_suffix_scan_##__prefix##_##__suffix(__lane_mask, __x));         \
  }
222
// Sum scans/reductions over the four arithmetic types; identity is 0.
#define __GPU_OP(__x, __y) ((__x) + (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, 0, add, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, 0, add, u64);
__DO_LANE_OPS(float, __GPU_OP, 0, add, f32);
__DO_LANE_OPS(double, __GPU_OP, 0, add, f64);
#undef __GPU_OP

// Bitwise AND; identity is the all-ones value for the type's width.
#define __GPU_OP(__x, __y) ((__x) & (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, UINT32_MAX, and, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, UINT64_MAX, and, u64);
#undef __GPU_OP

// Bitwise OR; identity is 0.
#define __GPU_OP(__x, __y) ((__x) | (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, 0, or, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, 0, or, u64);
#undef __GPU_OP

// Bitwise XOR; identity is 0.
#define __GPU_OP(__x, __y) ((__x) ^ (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, 0, xor, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, 0, xor, u64);
#undef __GPU_OP

// Unsigned minimum; identity is the maximum representable value.
#define __GPU_OP(__x, __y) ((__x) < (__y) ? (__x) : (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, UINT32_MAX, min, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, UINT64_MAX, min, u64);
#undef __GPU_OP

// Unsigned maximum; identity is 0.
#define __GPU_OP(__x, __y) ((__x) > (__y) ? (__x) : (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, 0, max, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, 0, max, u64);
#undef __GPU_OP

// Floating-point minimum via __builtin_elementwise_minnum; identity +inf.
#define __GPU_OP(__x, __y) __builtin_elementwise_minnum((__x), (__y))
__DO_LANE_OPS(float, __GPU_OP, __builtin_inff(), minnum, f32);
__DO_LANE_OPS(double, __GPU_OP, __builtin_inf(), minnum, f64);
#undef __GPU_OP

// Floating-point maximum via __builtin_elementwise_maxnum; identity -inf.
#define __GPU_OP(__x, __y) __builtin_elementwise_maxnum((__x), (__y))
__DO_LANE_OPS(float, __GPU_OP, -__builtin_inff(), maxnum, f32);
__DO_LANE_OPS(double, __GPU_OP, -__builtin_inf(), maxnum, f64);
#undef __GPU_OP

#undef __DO_LANE_OPS
266
// Returns a bitmask marking all lanes that have the same value of __x.
// Overridable: a target header may pre-define __gpu_match_any_u32_impl to
// supply a native implementation instead of this generic loop.
#ifndef __gpu_match_any_u32_impl
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
  uint64_t __match_mask = 0;

  // Every iteration peels off one distinct value: the lowest still-active
  // lane broadcasts its value, the lanes holding that value record the ballot
  // of agreeing lanes and mark themselves done. The loop condition re-ballots
  // so all lanes stay converged until everyone has found its group.
  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint32_t __first = __gpu_shuffle_idx_u32(
          __active_mask, __builtin_ctzg(__active_mask), __x, __gpu_num_lanes());
      uint64_t __ballot = __gpu_ballot(__active_mask, __first == __x);
      if (__first == __x) {
        __match_mask = __ballot;
        __done = 1;
      }
    }
  }
  return __match_mask;
}
#endif
#undef __gpu_match_any_u32_impl
290
// Returns a bitmask marking all lanes that have the same value of __x.
// Overridable: a target header may pre-define __gpu_match_any_u64_impl to
// supply a native implementation instead of this generic loop.
#ifndef __gpu_match_any_u64_impl
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  // Every iteration peels off one distinct value: the lowest still-active
  // lane broadcasts its value, the lanes holding that value record the ballot
  // of agreeing lanes and mark themselves done. The loop condition re-ballots
  // so all lanes stay converged until everyone has found its group.
  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint64_t __first = __gpu_shuffle_idx_u64(
          __active_mask, __builtin_ctzg(__active_mask), __x, __gpu_num_lanes());
      uint64_t __ballot = __gpu_ballot(__active_mask, __first == __x);
      if (__first == __x) {
        __match_mask = __ballot;
        __done = 1;
      }
    }
  }
  return __match_mask;
}
#endif
#undef __gpu_match_any_u64_impl
314
315// Returns the current lane mask if every lane contains __x.
316#ifndef __gpu_match_all_u32_impl
317_DEFAULT_FN_ATTRS static __inline__ uint64_t
318__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
319 uint32_t __first = __gpu_shuffle_idx_u32(
320 __lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
321 uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
322 return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
323}
324#endif
325#undef __gpu_match_all_u32_impl
326
327// Returns the current lane mask if every lane contains __x.
328#ifndef __gpu_match_all_u64_impl
329_DEFAULT_FN_ATTRS static __inline__ uint64_t
330__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
331 uint64_t __first = __gpu_shuffle_idx_u64(
332 __lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
333 uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
334 return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
335}
336#endif
337#undef __gpu_match_all_u64_impl
338
339_Pragma("omp end declare variant");
340_Pragma("omp end declare target");
341
342#if !defined(__cplusplus)
343_Pragma("pop_macro(\"bool\")");
344#endif
345
346#undef _DEFAULT_FN_ATTRS
347
348#endif // __GPUINTRIN_H
__DEVICE__ unsigned int __ballot(int __a)
__DEVICE__ int min(int __a, int __b)
__DEVICE__ int max(int __a, int __b)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
#define _DEFAULT_FN_ATTRS
#define __GPU_OP(__x, __y)
Definition gpuintrin.h:223
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id(int __dim)
Definition gpuintrin.h:70
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x)
Definition gpuintrin.h:270
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:125
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, uint32_t __width)
Definition gpuintrin.h:150
static _DEFAULT_FN_ATTRS __inline__ double __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x)
Definition gpuintrin.h:142
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_first_in_lane(uint64_t __lane_mask)
Definition gpuintrin.h:119
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads(int __dim)
Definition gpuintrin.h:84
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id(int __dim)
Definition gpuintrin.h:98
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x)
Definition gpuintrin.h:318
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_first_lane_id(uint64_t __lane_mask)
Definition gpuintrin.h:113
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:330
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:294
static _DEFAULT_FN_ATTRS __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x, uint32_t __width)
Definition gpuintrin.h:161
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks(int __dim)
Definition gpuintrin.h:56
static _DEFAULT_FN_ATTRS __inline__ float __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x)
Definition gpuintrin.h:134
#define __DO_LANE_OPS(__type, __op, __identity, __prefix, __suffix)
Definition gpuintrin.h:184
static _DEFAULT_FN_ATTRS __inline__ double __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x, uint32_t __width)
Definition gpuintrin.h:170
_Pragma("push_macro(\"bool\")")
#define or
Definition iso646.h:24
#define xor
Definition iso646.h:26
#define and
Definition iso646.h:17