10#define __NVPTXINTRIN_H
13#error "This file is intended for NVPTX targets or offloading to NVPTX"
17#error "Never use <nvptxintrin.h> directly; include <gpuintrin.h> instead"
21#define __CUDA_ARCH__ 0
24_Pragma(
"omp begin declare target device_type(nohost)");
25_Pragma(
"omp begin declare variant match(device = {arch(nvptx64)})");
28#define __gpu_private __attribute__((address_space(5)))
29#define __gpu_constant __attribute__((address_space(4)))
30#define __gpu_local __attribute__((address_space(3)))
31#define __gpu_global __attribute__((address_space(1)))
32#define __gpu_generic __attribute__((address_space(0)))
35#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
39 return __nvvm_read_ptx_sreg_nctaid_x();
44 return __nvvm_read_ptx_sreg_nctaid_y();
49 return __nvvm_read_ptx_sreg_nctaid_z();
54 return __nvvm_read_ptx_sreg_ctaid_x();
59 return __nvvm_read_ptx_sreg_ctaid_y();
64 return __nvvm_read_ptx_sreg_ctaid_z();
69 return __nvvm_read_ptx_sreg_ntid_x();
74 return __nvvm_read_ptx_sreg_ntid_y();
79 return __nvvm_read_ptx_sreg_ntid_z();
84 return __nvvm_read_ptx_sreg_tid_x();
89 return __nvvm_read_ptx_sreg_tid_y();
94 return __nvvm_read_ptx_sreg_tid_z();
99 return __nvvm_read_ptx_sreg_warpsize();
104 return __nvvm_read_ptx_sreg_laneid();
109 return __nvvm_activemask();
115 uint32_t __mask = (uint32_t)__lane_mask;
116 uint32_t __id = __builtin_ffs(__mask) - 1;
117 return __nvvm_shfl_sync_idx_i32(__mask, __x, __id,
__gpu_num_lanes() - 1);
123 uint32_t __mask = (uint32_t)__lane_mask;
124 return __nvvm_vote_ballot_sync(__mask, __x);
134 __nvvm_bar_warp_sync((uint32_t)__lane_mask);
142 uint32_t __mask = (uint32_t)__lane_mask;
143 bool __bitmask = (1ull << __idx) & __lane_mask;
145 __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
153#if __CUDA_ARCH__ >= 700
154 return __nvvm_match_any_sync_i32(__lane_mask, __x);
164#if __CUDA_ARCH__ >= 700
165 return __nvvm_match_any_sync_i64(__lane_mask, __x);
175#if __CUDA_ARCH__ >= 700
177 return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
187#if __CUDA_ARCH__ >= 700
189 return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
197 return __nvvm_isspacep_shared(ptr);
202 return __nvvm_isspacep_local(ptr);
212 if (__nvvm_reflect(
"__CUDA_ARCH") >= 700)
213 asm(
"nanosleep.u32 64;" :::
"memory");
#define _DEFAULT_FN_ATTRS
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_thread_suspend(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_lane_mask(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_lane(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_private(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_local(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
_Pragma("omp begin declare target device_type(nohost)")
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_exit(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_threads(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)