20#if !defined(_DEFAULT_FN_ATTRS)
21#if defined(__HIP__) || defined(__CUDA__)
22#define _DEFAULT_FN_ATTRS __attribute__((device))
24#define _DEFAULT_FN_ATTRS
30#elif defined(__AMDGPU__)
32#elif !defined(_OPENMP)
33#error "This header is only meant to be used on GPU architectures."
38#if !defined(__cplusplus)
43_Pragma(
"omp begin declare target device_type(nohost)");
44_Pragma(
"omp begin declare variant match(device = {kind(gpu)})");
60 __builtin_unreachable();
74 __builtin_unreachable();
88 __builtin_unreachable();
102 __builtin_unreachable();
109 return __builtin_ffsll(__lane_mask) - 1;
121 return __builtin_bit_cast(
123 __builtin_bit_cast(uint32_t, __x)));
129 return __builtin_bit_cast(
131 __builtin_bit_cast(uint64_t, __x)));
137 return __builtin_bit_cast(
139 __builtin_bit_cast(uint32_t, __x)));
145 return __builtin_bit_cast(
147 __builtin_bit_cast(uint64_t, __x)));
151#define __DO_LANE_SUM(__type, __suffix) \
152 _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
153 uint64_t __lane_mask, __type __x) { \
154 for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
155 uint32_t __index = __step + __gpu_lane_id(); \
156 __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
158 return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
167#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
168 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
169 uint64_t __lane_mask, uint32_t __x) { \
170 for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
171 uint32_t __index = __gpu_lane_id() - __step; \
172 __bitmask_type bitmask = __gpu_lane_id() >= __step; \
173 __x += __builtin_bit_cast( \
175 -bitmask & __builtin_bit_cast(__bitmask_type, \
176 __gpu_shuffle_idx_##__suffix( \
177 __lane_mask, __index, __x))); \
190#if !defined(__cplusplus)
194#undef _DEFAULT_FN_ATTRS
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
// `static` gives this helper internal linkage, matching the other helpers
// declared here. In C, a plain `inline` function defined in a header provides
// no external definition, so a call that is not inlined can fail to link.
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id(int __dim)
static _DEFAULT_FN_ATTRS __inline__ double __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x)
#define _DEFAULT_FN_ATTRS
static _DEFAULT_FN_ATTRS __inline__ double __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_first_in_lane(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads(int __dim)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id(int __dim)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_first_lane_id(uint64_t __lane_mask)
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)
#define __DO_LANE_SUM(__type, __suffix)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks(int __dim)
static _DEFAULT_FN_ATTRS __inline__ float __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x)
static _DEFAULT_FN_ATTRS __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x)
_Pragma("push_macro(\"bool\")")