// Defines the __NVPTXINTRIN_H marker macro.
// NOTE(review): presumably the include guard; the matching #ifndef is not
// visible in this excerpt.
10#define __NVPTXINTRIN_H
// Diagnostic for builds that are neither NVPTX targets nor offloading to
// NVPTX.
// NOTE(review): the preprocessor condition guarding it is not visible here.
13#error "This file is intended for NVPTX targets or offloading to NVPTX"
// Diagnostic for direct inclusion; gpuintrin.h is the intended entry point.
// NOTE(review): the preprocessor condition guarding it is not visible here.
17#error "Never use <nvptxintrin.h> directly; include <gpuintrin.h> instead"
// Defines __CUDA_ARCH__ as 0.
// NOTE(review): presumably a fallback used only when the macro is not already
// defined; confirm against the surrounding #ifndef, which is not visible here.
21#define __CUDA_ARCH__ 0
// Opens an OpenMP declare target region restricted to device compilation
// (device_type(nohost)). The matching end directive is not visible in this
// excerpt.
24_Pragma(
"omp begin declare target device_type(nohost)");
// Opens an OpenMP declare variant region that is selected when the device
// architecture matches nvptx64. The matching end directive is not visible in
// this excerpt.
25_Pragma(
"omp begin declare variant match(device = {arch(nvptx64)})");
// Address-space qualifier macros. Each one expands to an address_space(N)
// attribute that can be attached to a pointer or variable type:
//   __gpu_private  -> address space 5
//   __gpu_constant -> address space 4
//   __gpu_local    -> address space 3
//   __gpu_global   -> address space 1
//   __gpu_generic  -> address space 0
// NOTE(review): in the LLVM NVPTX backend, address space 3 is shared memory
// and address space 5 is per-thread local memory, so the names here do not
// match the NVPTX names one to one. Confirm against the NVPTX usage guide.
28#define __gpu_private __attribute__((address_space(5)))
29#define __gpu_constant __attribute__((address_space(4)))
30#define __gpu_local __attribute__((address_space(3)))
31#define __gpu_global __attribute__((address_space(1)))
32#define __gpu_generic __attribute__((address_space(0)))
36 return __nvvm_read_ptx_sreg_nctaid_x();
41 return __nvvm_read_ptx_sreg_nctaid_y();
46 return __nvvm_read_ptx_sreg_nctaid_z();
51 return __nvvm_read_ptx_sreg_ctaid_x();
56 return __nvvm_read_ptx_sreg_ctaid_y();
61 return __nvvm_read_ptx_sreg_ctaid_z();
66 return __nvvm_read_ptx_sreg_ntid_x();
71 return __nvvm_read_ptx_sreg_ntid_y();
76 return __nvvm_read_ptx_sreg_ntid_z();
81 return __nvvm_read_ptx_sreg_tid_x();
86 return __nvvm_read_ptx_sreg_tid_y();
91 return __nvvm_read_ptx_sreg_tid_z();
96 return __nvvm_read_ptx_sreg_warpsize();
101 return __nvvm_read_ptx_sreg_laneid();
106 return __nvvm_activemask();
112 uint32_t __mask = (uint32_t)__lane_mask;
113 uint32_t __id = __builtin_ffs(__mask) - 1;
114 return __nvvm_shfl_sync_idx_i32(__mask, __x, __id,
__gpu_num_lanes() - 1);
120 uint32_t __mask = (uint32_t)__lane_mask;
121 return __nvvm_vote_ballot_sync(__mask, __x);
131 __nvvm_bar_warp_sync((uint32_t)__lane_mask);
139 uint32_t __mask = (uint32_t)__lane_mask;
140 bool __bitmask = (1ull << __idx) & __lane_mask;
142 __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
150#if __CUDA_ARCH__ >= 700
151 return __nvvm_match_any_sync_i32(__lane_mask, __x);
161#if __CUDA_ARCH__ >= 700
162 return __nvvm_match_any_sync_i64(__lane_mask, __x);
172#if __CUDA_ARCH__ >= 700
174 return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
184#if __CUDA_ARCH__ >= 700
186 return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
194 return __nvvm_isspacep_shared(ptr);
199 return __nvvm_isspacep_local(ptr);
209 if (__nvvm_reflect(
"__CUDA_ARCH") >= 700)
210 asm(
"nanosleep.u32 64;" :::
"memory");
#define _DEFAULT_FN_ATTRS
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_thread_suspend(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_lane_mask(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_lane(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_private(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_local(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
_Pragma("omp begin declare target device_type(nohost)")
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_exit(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_threads(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)