clang 23.0.0git
gpuintrin.h
Go to the documentation of this file.
1//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Provides wrappers around the clang builtins for accessing GPU hardware
10// features. The interface is intended to be portable between architectures, but
11// some targets may provide different implementations. This header can be
12// included for all the common GPU programming languages, namely OpenMP, HIP,
13// CUDA, and OpenCL.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef __GPUINTRIN_H
18#define __GPUINTRIN_H
19
20#if !defined(_DEFAULT_FN_ATTRS)
21#if defined(__HIP__) || defined(__CUDA__)
22#define _DEFAULT_FN_ATTRS __attribute__((device))
23#else
24#define _DEFAULT_FN_ATTRS
25#endif
26#endif
27
28#include <stdint.h>
29
30#if !defined(__cplusplus)
31_Pragma("push_macro(\"bool\")");
32#define bool _Bool
33#endif
34
35_Pragma("omp begin declare target device_type(nohost)");
36_Pragma("omp begin declare variant match(device = {kind(gpu)})");
37
38// Forward declare a few functions for the implementation header.
39
40// Returns a bitmask marking all lanes that have the same value of __x.
41_DEFAULT_FN_ATTRS static __inline__ uint64_t
42__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);
43
44// Returns a bitmask marking all lanes that have the same value of __x.
45_DEFAULT_FN_ATTRS static __inline__ uint64_t
46__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);
47
48// Returns the current lane mask if every lane contains __x.
49_DEFAULT_FN_ATTRS static __inline__ uint64_t
50__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);
51
52// Returns the current lane mask if every lane contains __x.
53_DEFAULT_FN_ATTRS static __inline__ uint64_t
54__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
55
56_Pragma("omp end declare variant");
57_Pragma("omp end declare target");
58
59#if defined(__NVPTX__)
60#include <nvptxintrin.h>
61#elif defined(__AMDGPU__)
62#include <amdgpuintrin.h>
63#elif defined(__SPIRV__)
64#include <spirvintrin.h>
65#elif !defined(_OPENMP)
66#error "This header is only meant to be used on GPU architectures."
67#endif
68
69_Pragma("omp begin declare target device_type(nohost)");
70_Pragma("omp begin declare variant match(device = {kind(gpu)})");
71
72// Attribute to declare a function as a kernel.
73#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
74
75#define __GPU_X_DIM 0
76#define __GPU_Y_DIM 1
77#define __GPU_Z_DIM 2
78
79// Returns the number of blocks in the requested dimension.
80_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
81 switch (__dim) {
82 case 0:
83 return __gpu_num_blocks_x();
84 case 1:
85 return __gpu_num_blocks_y();
86 case 2:
87 return __gpu_num_blocks_z();
88 default:
89 return 1;
90 }
91}
92
93// Returns the number of block id in the requested dimension.
94_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
95 switch (__dim) {
96 case 0:
97 return __gpu_block_id_x();
98 case 1:
99 return __gpu_block_id_y();
100 case 2:
101 return __gpu_block_id_z();
102 default:
103 return 0;
104 }
105}
106
107// Returns the number of threads in the requested dimension.
108_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
109 switch (__dim) {
110 case 0:
111 return __gpu_num_threads_x();
112 case 1:
113 return __gpu_num_threads_y();
114 case 2:
115 return __gpu_num_threads_z();
116 default:
117 return 1;
118 }
119}
120
121// Returns the thread id in the requested dimension.
122_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
123 switch (__dim) {
124 case 0:
125 return __gpu_thread_id_x();
126 case 1:
127 return __gpu_thread_id_y();
128 case 2:
129 return __gpu_thread_id_z();
130 default:
131 return 0;
132 }
133}
134
// Get the first active thread inside the lane.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_first_lane_id(uint64_t __lane_mask) {
  // ffsll returns the one-based position of the lowest set bit, so subtract
  // one to get a zero-based lane index.
  int __pos = __builtin_ffsll(__lane_mask);
  return __pos - 1;
}
140
// Conditional that is only true for a single thread in a lane.
_DEFAULT_FN_ATTRS static __inline__ bool
__gpu_is_first_in_lane(uint64_t __lane_mask) {
  uint64_t __leader = __gpu_first_lane_id(__lane_mask);
  return __leader == __gpu_lane_id();
}
146
// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  // Broadcast the two 32-bit halves separately and reassemble the result.
  uint64_t __upper =
      __gpu_read_first_lane_u32(__lane_mask, (uint32_t)(__x >> 32));
  uint64_t __lower = __gpu_read_first_lane_u32(__lane_mask, (uint32_t)__x);
  return (__upper << 32) | (__lower & 0xFFFFFFFF);
}
155
156// Gets the first floating point value from the active lanes.
157_DEFAULT_FN_ATTRS static __inline__ float
158__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
159 return __builtin_bit_cast(
160 float, __gpu_read_first_lane_u32(__lane_mask,
161 __builtin_bit_cast(uint32_t, __x)));
162}
163
164// Gets the first floating point value from the active lanes.
165_DEFAULT_FN_ATTRS static __inline__ double
166__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
167 return __builtin_bit_cast(
168 double, __gpu_read_first_lane_u64(__lane_mask,
169 __builtin_bit_cast(uint64_t, __x)));
170}
171
// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
                      uint32_t __width) {
  // Shuffle the two 32-bit halves separately and reassemble the result.
  uint32_t __mask32 = (uint32_t)__lane_mask;
  uint64_t __upper =
      __gpu_shuffle_idx_u32(__mask32, __idx, (uint32_t)(__x >> 32), __width);
  uint64_t __lower =
      __gpu_shuffle_idx_u32(__mask32, __idx, (uint32_t)__x, __width);
  return (__upper << 32) | __lower;
}
182
183// Shuffles the the lanes according to the given index.
184_DEFAULT_FN_ATTRS static __inline__ float
185__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
186 uint32_t __width) {
187 return __builtin_bit_cast(
188 float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
189 __builtin_bit_cast(uint32_t, __x), __width));
190}
191
192// Shuffles the the lanes according to the given index.
193_DEFAULT_FN_ATTRS static __inline__ double
194__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
195 uint32_t __width) {
196 return __builtin_bit_cast(
197 double,
198 __gpu_shuffle_idx_u64(__lane_mask, __idx,
199 __builtin_bit_cast(uint64_t, __x), __width));
200}
201
// Implements scan and reduction operations across a GPU warp or wavefront.
//
// Both scans work by iterating log2(N) steps. The bitmask tracks the currently
// unprocessed lanes, above or below the current lane in the case of a suffix or
// prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
// clear the bits that this operation handled.
//
// Each expansion of this macro defines three helpers for one (__op, __type)
// pair, named with the given __prefix (operation name) and __suffix (type tag):
//   __gpu_suffix_scan_<op>_<ty>: combines the lane's value with the values of
//     the lanes strictly above it in __lane_mask (seeded via `__above`).
//   __gpu_prefix_scan_<op>_<ty>: combines the lane's value with the values of
//     the lanes strictly below it in __lane_mask (seeded via `__below`).
//   __gpu_lane_<op>_<ty>: a full cross-lane reduction, implemented by
//     broadcasting the first active lane's suffix-scan result to every lane.
// When a step has no unprocessed neighbor left, __identity is combined instead
// so the result is unchanged.
#define __DO_LANE_OPS(__type, __op, __identity, __prefix, __suffix)            \
  _DEFAULT_FN_ATTRS static __inline__ __type                                   \
      __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,          \
                                                __type __x) {                  \
    uint64_t __above = __lane_mask & -(UINT64_C(2) << __gpu_lane_id());        \
    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
      uint32_t __src = __builtin_ctzg(__above, (int)sizeof(__above) * 8);      \
      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  \
                                                     __gpu_num_lanes());       \
      __x = __op(__x, __above ? __result : (__type)__identity);                \
      for (uint32_t __i = 0; __i < __step; ++__i)                              \
        __above &= __above - 1;                                                \
    }                                                                          \
    return __x;                                                                \
  }                                                                            \
                                                                               \
  _DEFAULT_FN_ATTRS static __inline__ __type                                   \
      __gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,          \
                                                __type __x) {                  \
    uint64_t __below = __lane_mask & ((UINT64_C(1) << __gpu_lane_id()) - 1);   \
    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
      uint32_t __src = 63 - __builtin_clzg(__below, (int)sizeof(__below) * 8); \
      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  \
                                                     __gpu_num_lanes());       \
      __x = __op(__x, __below ? __result : (__type)__identity);                \
      for (uint32_t __i = 0; __i < __step; ++__i)                              \
        __below ^=                                                             \
            (UINT64_C(1) << (63 - __builtin_clzg(__below, 0))) & __below;      \
    }                                                                          \
    return __x;                                                                \
  }                                                                            \
                                                                               \
  _DEFAULT_FN_ATTRS static __inline__ __type                                   \
      __gpu_lane_##__prefix##_##__suffix(uint64_t __lane_mask, __type __x) {   \
    return __gpu_read_first_lane_##__suffix(                                   \
        __lane_mask,                                                           \
        __gpu_suffix_scan_##__prefix##_##__suffix(__lane_mask, __x));          \
  }
246
// Instantiate the scan and reduction helpers for every supported operation.
// Each __DO_LANE_OPS invocation provides the combining operation, its identity
// element, a name fragment, and the type suffix of the shuffle helpers.

// Addition, identity 0.
#define __GPU_OP(__x, __y) ((__x) + (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, 0, add, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, 0, add, u64);
__DO_LANE_OPS(float, __GPU_OP, 0, add, f32);
__DO_LANE_OPS(double, __GPU_OP, 0, add, f64);
#undef __GPU_OP

// Bitwise AND, identity all-ones.
#define __GPU_OP(__x, __y) ((__x) & (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, UINT32_MAX, and, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, UINT64_MAX, and, u64);
#undef __GPU_OP

// Bitwise OR, identity 0.
#define __GPU_OP(__x, __y) ((__x) | (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, 0, or, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, 0, or, u64);
#undef __GPU_OP

// Bitwise XOR, identity 0.
#define __GPU_OP(__x, __y) ((__x) ^ (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, 0, xor, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, 0, xor, u64);
#undef __GPU_OP

// Unsigned minimum, identity the maximum representable value.
#define __GPU_OP(__x, __y) ((__x) < (__y) ? (__x) : (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, UINT32_MAX, min, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, UINT64_MAX, min, u64);
#undef __GPU_OP

// Unsigned maximum, identity 0.
#define __GPU_OP(__x, __y) ((__x) > (__y) ? (__x) : (__y))
__DO_LANE_OPS(uint32_t, __GPU_OP, 0, max, u32);
__DO_LANE_OPS(uint64_t, __GPU_OP, 0, max, u64);
#undef __GPU_OP

// Floating-point minimum, identity +infinity.
#define __GPU_OP(__x, __y) __builtin_elementwise_minnum((__x), (__y))
__DO_LANE_OPS(float, __GPU_OP, __builtin_inff(), minnum, f32);
__DO_LANE_OPS(double, __GPU_OP, __builtin_inf(), minnum, f64);
#undef __GPU_OP

// Floating-point maximum, identity -infinity.
#define __GPU_OP(__x, __y) __builtin_elementwise_maxnum((__x), (__y))
__DO_LANE_OPS(float, __GPU_OP, -__builtin_inff(), maxnum, f32);
__DO_LANE_OPS(double, __GPU_OP, -__builtin_inf(), maxnum, f64);
#undef __GPU_OP

#undef __DO_LANE_OPS
290
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint64_t __match_mask = 0;

  // Lanes peel off group by group: each pass broadcasts the value held by the
  // lowest still-active lane, and every lane holding that value records the
  // ballot as its match mask and marks itself done. The trailing ballot keeps
  // all lanes in the loop until everyone has finished, so each iteration stays
  // convergent across __lane_mask.
  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      // Value of the lowest still-active lane.
      uint32_t __first = __gpu_shuffle_idx_u32(
          __active_mask, __builtin_ctzg(__active_mask), __x, __gpu_num_lanes());
      // Vote among active lanes on who matches that value.
      uint64_t __ballot = __gpu_ballot(__active_mask, __first == __x);
      if (__first == __x) {
        __match_mask = __ballot;
        __done = 1;
      }
    }
  }
  return __match_mask;
}
311
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  // Same peeling loop as the 32-bit variant, using the 64-bit shuffle: each
  // pass broadcasts the lowest still-active lane's value, lanes holding it
  // record the ballot and drop out, and the trailing ballot keeps every lane
  // iterating convergently until all are done.
  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      // Value of the lowest still-active lane.
      uint64_t __first = __gpu_shuffle_idx_u64(
          __active_mask, __builtin_ctzg(__active_mask), __x, __gpu_num_lanes());
      // Vote among active lanes on who matches that value.
      uint64_t __ballot = __gpu_ballot(__active_mask, __first == __x);
      if (__first == __x) {
        __match_mask = __ballot;
        __done = 1;
      }
    }
  }
  return __match_mask;
}
332
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  // Broadcast the lowest active lane's value and vote on who agrees with it.
  uint32_t __leader = __gpu_shuffle_idx_u32(
      __lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
  uint64_t __agreeing = __gpu_ballot(__lane_mask, __x == __leader);
  if (__agreeing == __lane_mask)
    return __lane_mask;
  return UINT64_C(0);
}
341
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  // Broadcast the lowest active lane's value and vote on who agrees with it.
  uint64_t __leader = __gpu_shuffle_idx_u64(
      __lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
  uint64_t __agreeing = __gpu_ballot(__lane_mask, __x == __leader);
  if (__agreeing == __lane_mask)
    return __lane_mask;
  return UINT64_C(0);
}
350
351_Pragma("omp end declare variant");
352_Pragma("omp end declare target");
353
354#if !defined(__cplusplus)
355_Pragma("pop_macro(\"bool\")");
356#endif
357
358#undef _DEFAULT_FN_ATTRS
359
360#endif // __GPUINTRIN_H
__DEVICE__ unsigned int __ballot(int __a)
__DEVICE__ int min(int __a, int __b)
__DEVICE__ int max(int __a, int __b)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
#define _DEFAULT_FN_ATTRS
#define __GPU_OP(__x, __y)
Definition gpuintrin.h:247
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id(int __dim)
Definition gpuintrin.h:94
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:149
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x)
Definition gpuintrin.h:293
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, uint32_t __width)
Definition gpuintrin.h:174
static _DEFAULT_FN_ATTRS __inline__ double __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x)
Definition gpuintrin.h:166
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:344
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x)
Definition gpuintrin.h:335
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_first_in_lane(uint64_t __lane_mask)
Definition gpuintrin.h:143
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads(int __dim)
Definition gpuintrin.h:108
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id(int __dim)
Definition gpuintrin.h:122
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_first_lane_id(uint64_t __lane_mask)
Definition gpuintrin.h:137
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:314
static _DEFAULT_FN_ATTRS __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x, uint32_t __width)
Definition gpuintrin.h:185
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks(int __dim)
Definition gpuintrin.h:80
static _DEFAULT_FN_ATTRS __inline__ float __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x)
Definition gpuintrin.h:158
#define __DO_LANE_OPS(__type, __op, __identity, __prefix, __suffix)
Definition gpuintrin.h:208
static _DEFAULT_FN_ATTRS __inline__ double __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x, uint32_t __width)
Definition gpuintrin.h:194
_Pragma("push_macro(\"bool\")")
#define or
Definition iso646.h:24
#define xor
Definition iso646.h:26
#define and
Definition iso646.h:17