9#ifndef __AMDGPUINTRIN_H
10#define __AMDGPUINTRIN_H
13#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
18#if !defined(__cplusplus)
23_Pragma(
"omp begin declare target device_type(nohost)");
24_Pragma(
"omp begin declare variant match(device = {arch(amdgcn)})");
27#define __gpu_private __attribute__((address_space(5)))
28#define __gpu_constant __attribute__((address_space(4)))
29#define __gpu_local __attribute__((address_space(3)))
30#define __gpu_global __attribute__((address_space(1)))
31#define __gpu_generic __attribute__((address_space(0)))
34#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
38 return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
43 return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
48 return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
53 return __builtin_amdgcn_workgroup_id_x();
58 return __builtin_amdgcn_workgroup_id_y();
63 return __builtin_amdgcn_workgroup_id_z();
68 return __builtin_amdgcn_workgroup_size_x();
73 return __builtin_amdgcn_workgroup_size_y();
78 return __builtin_amdgcn_workgroup_size_z();
83 return __builtin_amdgcn_workitem_id_x();
88 return __builtin_amdgcn_workitem_id_y();
93 return __builtin_amdgcn_workitem_id_z();
99 return __builtin_amdgcn_wavefrontsize();
104 return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
109 return __builtin_amdgcn_read_exec();
115 return __builtin_amdgcn_readfirstlane(__x);
121 uint32_t __hi = (uint32_t)(__x >> 32ull);
122 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
123 return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
124 ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
132 return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
137 __builtin_amdgcn_s_barrier();
138 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"workgroup");
143 __builtin_amdgcn_wave_barrier();
149 return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
155 uint32_t __hi = (uint32_t)(__x >> 32ull);
156 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
157 return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
158 ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
163 return __builtin_amdgcn_is_shared((
void __attribute__((address_space(0))) *)((
175 __builtin_amdgcn_endpgm();
180 __builtin_amdgcn_s_sleep(2);
186#if !defined(__cplusplus)
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static _DEFAULT_FN_ATTRS __inline__ void __gpu_thread_suspend(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_lane_mask(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_lane(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_private(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_local(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
_DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_exit(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_threads(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
_Pragma("push_macro(\"bool\")")
#define _DEFAULT_FN_ATTRS