clang 20.0.0git
gpuintrin.h
Go to the documentation of this file.
1//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Provides wrappers around the clang builtins for accessing GPU hardware
10// features. The interface is intended to be portable between architectures, but
11// some targets may provide different implementations. This header can be
12// included for all the common GPU programming languages, namely OpenMP, HIP,
13// CUDA, and OpenCL.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef __GPUINTRIN_H
18#define __GPUINTRIN_H
19
20#if !defined(_DEFAULT_FN_ATTRS)
21#if defined(__HIP__) || defined(__CUDA__)
22#define _DEFAULT_FN_ATTRS __attribute__((device))
23#else
24#define _DEFAULT_FN_ATTRS
25#endif
26#endif
27
28#if defined(__NVPTX__)
29#include <nvptxintrin.h>
30#elif defined(__AMDGPU__)
31#include <amdgpuintrin.h>
32#elif !defined(_OPENMP)
33#error "This header is only meant to be used on GPU architectures."
34#endif
35
36#include <stdint.h>
37
38#if !defined(__cplusplus)
39_Pragma("push_macro(\"bool\")");
40#define bool _Bool
41#endif
42
43_Pragma("omp begin declare target device_type(nohost)");
44_Pragma("omp begin declare variant match(device = {kind(gpu)})");
45
46#define __GPU_X_DIM 0
47#define __GPU_Y_DIM 1
48#define __GPU_Z_DIM 2
49
50// Returns the number of blocks in the requested dimension.
51_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
52 switch (__dim) {
53 case 0:
54 return __gpu_num_blocks_x();
55 case 1:
56 return __gpu_num_blocks_y();
57 case 2:
58 return __gpu_num_blocks_z();
59 default:
60 __builtin_unreachable();
61 }
62}
63
64// Returns the number of block id in the requested dimension.
65_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
66 switch (__dim) {
67 case 0:
68 return __gpu_block_id_x();
69 case 1:
70 return __gpu_block_id_y();
71 case 2:
72 return __gpu_block_id_z();
73 default:
74 __builtin_unreachable();
75 }
76}
77
78// Returns the number of threads in the requested dimension.
79_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
80 switch (__dim) {
81 case 0:
82 return __gpu_num_threads_x();
83 case 1:
84 return __gpu_num_threads_y();
85 case 2:
86 return __gpu_num_threads_z();
87 default:
88 __builtin_unreachable();
89 }
90}
91
92// Returns the thread id in the requested dimension.
93_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
94 switch (__dim) {
95 case 0:
96 return __gpu_thread_id_x();
97 case 1:
98 return __gpu_thread_id_y();
99 case 2:
100 return __gpu_thread_id_z();
101 default:
102 __builtin_unreachable();
103 }
104}
105
106// Get the first active thread inside the lane.
107_DEFAULT_FN_ATTRS static __inline__ uint64_t
108__gpu_first_lane_id(uint64_t __lane_mask) {
109 return __builtin_ffsll(__lane_mask) - 1;
110}
111
112// Conditional that is only true for a single thread in a lane.
113_DEFAULT_FN_ATTRS static __inline__ bool
114__gpu_is_first_in_lane(uint64_t __lane_mask) {
115 return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
116}
117
118// Gets the first floating point value from the active lanes.
119_DEFAULT_FN_ATTRS static __inline__ float
120__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
121 return __builtin_bit_cast(
122 float, __gpu_read_first_lane_u32(__lane_mask,
123 __builtin_bit_cast(uint32_t, __x)));
124}
125
126// Gets the first floating point value from the active lanes.
127_DEFAULT_FN_ATTRS static __inline__ double
128__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
129 return __builtin_bit_cast(
130 double, __gpu_read_first_lane_u64(__lane_mask,
131 __builtin_bit_cast(uint64_t, __x)));
132}
133
134// Shuffles the the lanes according to the given index.
135_DEFAULT_FN_ATTRS static __inline__ float
136__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
137 return __builtin_bit_cast(
138 float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
139 __builtin_bit_cast(uint32_t, __x)));
140}
141
142// Shuffles the the lanes according to the given index.
143_DEFAULT_FN_ATTRS static __inline__ double
144__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
145 return __builtin_bit_cast(
146 double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
147 __builtin_bit_cast(uint64_t, __x)));
148}
149
// Gets the sum of all lanes inside the warp or wavefront.
//
// Tree reduction: each iteration halves the stride, and every lane adds in
// the value shuffled from lane (__gpu_lane_id() + __step). After the loop
// the first lane holds the total, which is then broadcast to all lanes.
// NOTE(review): this assumes __gpu_num_lanes() is a power of two and that
// all lanes in __lane_mask participate — confirm against the target headers.
#define __DO_LANE_SUM(__type, __suffix)                                        \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
      uint64_t __lane_mask, __type __x) {                                      \
    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
      uint32_t __index = __step + __gpu_lane_id();                             \
      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          \
    }                                                                          \
    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
  }
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
#undef __DO_LANE_SUM
165
166// Gets the accumulator scan of the threads in the warp or wavefront.
167#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
168 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
169 uint64_t __lane_mask, uint32_t __x) { \
170 for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
171 uint32_t __index = __gpu_lane_id() - __step; \
172 __bitmask_type bitmask = __gpu_lane_id() >= __step; \
173 __x += __builtin_bit_cast( \
174 __type, \
175 -bitmask & __builtin_bit_cast(__bitmask_type, \
176 __gpu_shuffle_idx_##__suffix( \
177 __lane_mask, __index, __x))); \
178 } \
179 return __x; \
180 }
181__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
182__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
183__DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
184__DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
185#undef __DO_LANE_SCAN
186
187_Pragma("omp end declare variant");
188_Pragma("omp end declare target");
189
190#if !defined(__cplusplus)
191_Pragma("pop_macro(\"bool\")");
192#endif
193
194#undef _DEFAULT_FN_ATTRS
195
196#endif // __GPUINTRIN_H
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
Definition: amdgpuintrin.h:82
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
Definition: amdgpuintrin.h:103
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
Definition: amdgpuintrin.h:77
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x)
Definition: amdgpuintrin.h:148
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
Definition: amdgpuintrin.h:114
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
Definition: amdgpuintrin.h:57
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
Definition: amdgpuintrin.h:92
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
Definition: amdgpuintrin.h:37
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
Definition: amdgpuintrin.h:62
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
Definition: amdgpuintrin.h:72
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x)
Definition: amdgpuintrin.h:154
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
Definition: amdgpuintrin.h:67
_DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
Definition: amdgpuintrin.h:120
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
Definition: amdgpuintrin.h:87
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
Definition: amdgpuintrin.h:47
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
Definition: amdgpuintrin.h:52
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
Definition: amdgpuintrin.h:42
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id(int __dim)
Definition: gpuintrin.h:65
static _DEFAULT_FN_ATTRS __inline__ double __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x)
Definition: gpuintrin.h:144
#define _DEFAULT_FN_ATTRS
Definition: gpuintrin.h:24
static _DEFAULT_FN_ATTRS __inline__ double __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x)
Definition: gpuintrin.h:128
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_first_in_lane(uint64_t __lane_mask)
Definition: gpuintrin.h:114
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads(int __dim)
Definition: gpuintrin.h:79
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id(int __dim)
Definition: gpuintrin.h:93
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_first_lane_id(uint64_t __lane_mask)
Definition: gpuintrin.h:108
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)
Definition: gpuintrin.h:167
#define __DO_LANE_SUM(__type, __suffix)
Definition: gpuintrin.h:151
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks(int __dim)
Definition: gpuintrin.h:51
static _DEFAULT_FN_ATTRS __inline__ float __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x)
Definition: gpuintrin.h:120
static _DEFAULT_FN_ATTRS __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x)
Definition: gpuintrin.h:136
_Pragma("push_macro(\"bool\")")
unsigned long uint64_t