20#if !defined(_DEFAULT_FN_ATTRS)
21#if defined(__HIP__) || defined(__CUDA__)
22#define _DEFAULT_FN_ATTRS __attribute__((device))
24#define _DEFAULT_FN_ATTRS
30#if !defined(__cplusplus)
37#elif defined(__AMDGPU__)
39#elif defined(__SPIRV__)
41#elif !defined(_OPENMP)
42#error "This header is only meant to be used on GPU architectures."
// Begin an OpenMP `declare target` region with `device_type(nohost)`.
45_Pragma(
"omp begin declare target device_type(nohost)");
// Begin an OpenMP `declare variant` region that matches `device = {kind(gpu)}`.
46_Pragma(
"omp begin declare variant match(device = {kind(gpu)})");
// Attribute shorthand for kernel entry points: applies the `device_kernel`
// attribute and gives the symbol "protected" visibility.
49#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
114 return __builtin_ffsll(__lane_mask) - 1;
126 uint32_t __hi = (uint32_t)(__x >> 32);
127 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
135 return __builtin_bit_cast(
137 __builtin_bit_cast(uint32_t, __x)));
143 return __builtin_bit_cast(
145 __builtin_bit_cast(uint64_t, __x)));
152 uint32_t __hi = (uint32_t)(__x >> 32);
153 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
154 uint32_t __mask = (uint32_t)__lane_mask;
163 return __builtin_bit_cast(
165 __builtin_bit_cast(uint32_t, __x), __width));
172 return __builtin_bit_cast(
175 __builtin_bit_cast(uint64_t, __x), __width));
// Generates a family of static inline lane-level helpers for one
// (__type, __op) pairing. Names are built by token pasting:
//   __gpu_suffix_scan_<__prefix>_<__suffix>
//   __gpu_prefix_scan_<__prefix>_<__suffix>
//   __gpu_lane_<__prefix>_<__suffix>
//
// Suffix scan: `__above` starts as the bits of `__lane_mask` strictly above
// the calling lane's id. Each round (__step = 1, 2, 4, ... while it is less
// than __gpu_num_lanes()) calls __gpu_shuffle_idx_<__suffix> with the index of
// the lowest set bit of `__above` (the ctzg fallback is the bit width when
// `__above` is zero), combines the result into `__x` with `__op` (substituting
// `__identity` when `__above` is zero), and then clears the `__step` lowest
// set bits of `__above`.
//
// Prefix scan: same round structure, but driven by `__below`, the bits of
// `__lane_mask` strictly below the calling lane's id, and sourcing from the
// index of its highest set bit (63 - clzg).
//
// __gpu_lane_*: passes the suffix-scan result to
// __gpu_read_first_lane_<__suffix> and returns what that call returns.
184#define __DO_LANE_OPS(__type, __op, __identity, __prefix, __suffix) \
185 _DEFAULT_FN_ATTRS static __inline__ __type \
186 __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
188 uint64_t __above = __lane_mask & -(UINT64_C(2) << __gpu_lane_id()); \
189 for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
190 uint32_t __src = __builtin_ctzg(__above, (int)sizeof(__above) * 8); \
191 __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
192 __gpu_num_lanes()); \
193 __x = __op(__x, __above ? __result : (__type)__identity); \
194 for (uint32_t __i = 0; __i < __step; ++__i) \
195 __above &= __above - 1; \
200 _DEFAULT_FN_ATTRS static __inline__ __type \
201 __gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
203 uint64_t __below = __lane_mask & ((UINT64_C(1) << __gpu_lane_id()) - 1); \
204 for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
205 uint32_t __src = 63 - __builtin_clzg(__below, (int)sizeof(__below) * 8); \
206 __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
207 __gpu_num_lanes()); \
208 __x = __op(__x, __below ? __result : (__type)__identity); \
209 for (uint32_t __i = 0; __i < __step; ++__i) \
211 (UINT64_C(1) << (63 - __builtin_clzg(__below, 0))) & __below; \
216 _DEFAULT_FN_ATTRS static __inline__ __type \
217 __gpu_lane_##__prefix##_##__suffix(uint64_t __lane_mask, __type __x) { \
218 return __gpu_read_first_lane_##__suffix( \
220 __gpu_suffix_scan_##__prefix##_##__suffix(__lane_mask, __x)); \
// Binary combiner: sum of the two operands.
223#define __GPU_OP(__x, __y) ((__x) + (__y))
// Binary combiner: bitwise AND of the two operands.
230#define __GPU_OP(__x, __y) ((__x) & (__y))
// Binary combiner: bitwise OR of the two operands.
235#define __GPU_OP(__x, __y) ((__x) | (__y))
// Binary combiner: bitwise XOR of the two operands.
240#define __GPU_OP(__x, __y) ((__x) ^ (__y))
// Binary combiner: the smaller operand, chosen with `<`. Each operand appears
// twice in the expansion, so arguments should be free of side effects.
245#define __GPU_OP(__x, __y) ((__x) < (__y) ? (__x) : (__y))
// Binary combiner: the larger operand, chosen with `>`. Each operand appears
// twice in the expansion, so arguments should be free of side effects.
250#define __GPU_OP(__x, __y) ((__x) > (__y) ? (__x) : (__y))
// Binary combiner: forwards both operands to __builtin_elementwise_minnum.
255#define __GPU_OP(__x, __y) __builtin_elementwise_minnum((__x), (__y))
// Binary combiner: forwards both operands to __builtin_elementwise_maxnum.
260#define __GPU_OP(__x, __y) __builtin_elementwise_maxnum((__x), (__y))
268#ifndef __gpu_match_any_u32_impl
271 uint64_t __match_mask = 0;
274 for (uint64_t __active_mask = __lane_mask; __active_mask;
280 if (__first == __x) {
289#undef __gpu_match_any_u32_impl
292#ifndef __gpu_match_any_u64_impl
295 uint64_t __match_mask = 0;
298 for (uint64_t __active_mask = __lane_mask; __active_mask;
304 if (__first == __x) {
313#undef __gpu_match_any_u64_impl
316#ifndef __gpu_match_all_u32_impl
322 return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
325#undef __gpu_match_all_u32_impl
328#ifndef __gpu_match_all_u64_impl
334 return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
337#undef __gpu_match_all_u64_impl
342#if !defined(__cplusplus)
346#undef _DEFAULT_FN_ATTRS
__DEVICE__ unsigned int __ballot(int __a)
__DEVICE__ int min(int __a, int __b)
__DEVICE__ int max(int __a, int __b)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
#define _DEFAULT_FN_ATTRS
#define __GPU_OP(__x, __y)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id(int __dim)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ double __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_first_in_lane(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads(int __dim)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id(int __dim)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_first_lane_id(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks(int __dim)
static _DEFAULT_FN_ATTRS __inline__ float __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x)
#define __DO_LANE_OPS(__type, __op, __identity, __prefix, __suffix)
static _DEFAULT_FN_ATTRS __inline__ double __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x, uint32_t __width)
_Pragma("push_macro(\"bool\")")