clang 23.0.0git
nvptxintrin.h
Go to the documentation of this file.
1//===-- nvptxintrin.h - NVPTX intrinsic functions -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef __NVPTXINTRIN_H
10#define __NVPTXINTRIN_H
11
12#ifndef __NVPTX__
13#error "This file is intended for NVPTX targets or offloading to NVPTX"
14#endif
15
16#ifndef __GPUINTRIN_H
17#error "Never use <nvptxintrin.h> directly; include <gpuintrin.h> instead"
18#endif
19
// Make sure __CUDA_ARCH__ is always defined so the '#if __CUDA_ARCH__ >= 700'
// checks below remain valid even when the compilation does not set it.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 0
#endif

// Limit these definitions to device code for the nvptx64 architecture when
// building in OpenMP offloading mode.
_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
26
// Type aliases to the address spaces used by the NVPTX backend. The numbers
// follow the NVPTX target's address space numbering: 5 = per-thread 'local',
// 4 = 'const', 3 = 'shared', 1 = 'global', 0 = generic (flat) pointers.
#define __gpu_private __attribute__((address_space(5)))
#define __gpu_constant __attribute__((address_space(4)))
#define __gpu_local __attribute__((address_space(3)))
#define __gpu_global __attribute__((address_space(1)))
#define __gpu_generic __attribute__((address_space(0)))
33
// Returns the number of CUDA blocks in the 'x' dimension.
// Reads the PTX special register %nctaid.x (grid extent in CTAs along x).
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
  return __nvvm_read_ptx_sreg_nctaid_x();
}

// Returns the number of CUDA blocks in the 'y' dimension.
// Reads the PTX special register %nctaid.y.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
  return __nvvm_read_ptx_sreg_nctaid_y();
}

// Returns the number of CUDA blocks in the 'z' dimension.
// Reads the PTX special register %nctaid.z.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
  return __nvvm_read_ptx_sreg_nctaid_z();
}
48
// Returns the 'x' dimension of the current CUDA block's id.
// Reads the PTX special register %ctaid.x (this CTA's index along x).
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
  return __nvvm_read_ptx_sreg_ctaid_x();
}

// Returns the 'y' dimension of the current CUDA block's id.
// Reads the PTX special register %ctaid.y.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
  return __nvvm_read_ptx_sreg_ctaid_y();
}

// Returns the 'z' dimension of the current CUDA block's id.
// Reads the PTX special register %ctaid.z.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
  return __nvvm_read_ptx_sreg_ctaid_z();
}
63
// Returns the number of CUDA threads in the 'x' dimension.
// Reads the PTX special register %ntid.x (block extent in threads along x).
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
  return __nvvm_read_ptx_sreg_ntid_x();
}

// Returns the number of CUDA threads in the 'y' dimension.
// Reads the PTX special register %ntid.y.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
  return __nvvm_read_ptx_sreg_ntid_y();
}

// Returns the number of CUDA threads in the 'z' dimension.
// Reads the PTX special register %ntid.z.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
  return __nvvm_read_ptx_sreg_ntid_z();
}
78
// Returns the 'x' dimension id of the thread in the current CUDA block.
// Reads the PTX special register %tid.x (thread index within the CTA).
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
  return __nvvm_read_ptx_sreg_tid_x();
}

// Returns the 'y' dimension id of the thread in the current CUDA block.
// Reads the PTX special register %tid.y.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
  return __nvvm_read_ptx_sreg_tid_y();
}

// Returns the 'z' dimension id of the thread in the current CUDA block.
// Reads the PTX special register %tid.z.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
  return __nvvm_read_ptx_sreg_tid_z();
}
93
// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
// Reads the PTX special register %warpsize.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
  return __nvvm_read_ptx_sreg_warpsize();
}

// Returns the id of the thread inside of a CUDA warp executing together.
// Reads the PTX special register %laneid (0..warpsize-1).
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
  return __nvvm_read_ptx_sreg_laneid();
}

// Returns the bit-mask of active threads in the current warp.
// The 32-bit activemask result is zero-extended to the portable 64-bit type.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
  return __nvvm_activemask();
}
108
// Copies the value from the first active thread in the warp to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
  // NVIDIA warps are 32 lanes wide (see __gpu_num_lanes above), so only the
  // low 32 bits of the generic 64-bit lane mask are meaningful here.
  uint32_t __mask = (uint32_t)__lane_mask;
  // Index of the lowest set bit, i.e. the first active lane in the mask.
  uint32_t __id = __builtin_ffs(__mask) - 1;
  // Broadcast lane __id's value to every lane participating in __mask.
  return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
}
116
117// Returns a bitmask of threads in the current lane for which \p x is true.
118_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
119 bool __x) {
120 uint32_t __mask = (uint32_t)__lane_mask;
121 return __nvvm_vote_ballot_sync(__mask, __x);
122}
123
// Waits for all the threads in the block to converge and issues a fence.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
  __syncthreads();
}

// Waits for all threads in the warp to reconverge for independent scheduling.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
  // The warp barrier takes a 32-bit membership mask; the upper bits of the
  // portable 64-bit lane mask are irrelevant on 32-lane NVIDIA warps.
  __nvvm_bar_warp_sync((uint32_t)__lane_mask);
}
133
// Shuffles the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                      uint32_t __width) {
  // Mask out inactive lanes to match AMDGPU behavior.
  uint32_t __mask = (uint32_t)__lane_mask;
  // True iff the source lane __idx is active in the caller's lane mask.
  bool __bitmask = (1ull << __idx) & __lane_mask;
  // '-__bitmask' is all-ones when the source lane is active and zero
  // otherwise, so reads from inactive sources yield zero. The last operand
  // packs the PTX shfl.sync 'c' field: segment bound in bits 8..12 plus the
  // 0x1f index clamp, restricting the shuffle to groups of __width lanes.
  return -__bitmask &
         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
                                  ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
}
145
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
  // Newer targets can use the dedicated CUDA support.
#if __CUDA_ARCH__ >= 700
  return __nvvm_match_any_sync_i32(__lane_mask, __x);
#else
  // Older targets fall back to the generic software implementation provided
  // by <gpuintrin.h>.
  return __gpu_match_any_u32_impl(__lane_mask, __x);
#endif
}
156
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
  // Newer targets can use the dedicated CUDA support.
#if __CUDA_ARCH__ >= 700
  return __nvvm_match_any_sync_i64(__lane_mask, __x);
#else
  // Older targets fall back to the generic software implementation provided
  // by <gpuintrin.h>.
  return __gpu_match_any_u64_impl(__lane_mask, __x);
#endif
}
167
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
  // Newer targets can use the dedicated CUDA support.
#if __CUDA_ARCH__ >= 700
  // The builtin additionally reports whether all lanes matched through
  // 'predicate'; the returned mask already encodes that, so it is unused.
  int predicate;
  return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
#else
  // Older targets fall back to the generic software implementation provided
  // by <gpuintrin.h>.
  return __gpu_match_all_u32_impl(__lane_mask, __x);
#endif
}
179
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
  // Newer targets can use the dedicated CUDA support.
#if __CUDA_ARCH__ >= 700
  // The builtin additionally reports whether all lanes matched through
  // 'predicate'; the returned mask already encodes that, so it is unused.
  int predicate;
  return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
#else
  // Older targets fall back to the generic software implementation provided
  // by <gpuintrin.h>.
  return __gpu_match_all_u64_impl(__lane_mask, __x);
#endif
}
191
// Returns true if the flat pointer points to CUDA 'shared' memory.
// NVPTX 'shared' memory is what the portable __gpu_local space maps to.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
  return __nvvm_isspacep_shared(ptr);
}

// Returns true if the flat pointer points to CUDA 'local' memory.
// NVPTX 'local' memory is what the portable __gpu_private space maps to.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
  return __nvvm_isspacep_local(ptr);
}
201
// Terminates execution of the calling thread. Only this thread stops; other
// threads in the block keep running.
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
  __nvvm_exit();
}
206
// Suspend the thread briefly to assist the scheduler during busy loops.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
  // __nvvm_reflect is resolved to a constant by the NVVM reflect pass, so
  // this branch folds away; 'nanosleep' requires sm_70 or newer.
  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
    asm("nanosleep.u32 64;" ::: "memory");
}
212
// Close the OpenMP declare-variant and declare-target regions opened at the
// top of this header.
_Pragma("omp end declare variant");
_Pragma("omp end declare target");
215
216#endif // __NVPTXINTRIN_H
#define _DEFAULT_FN_ATTRS
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x)
Definition gpuintrin.h:268
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:317
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x)
Definition gpuintrin.h:308
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x)
Definition gpuintrin.h:288
static _DEFAULT_FN_ATTRS __inline__ void __gpu_thread_suspend(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
Definition nvptxintrin.h:80
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
Definition nvptxintrin.h:75
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_lane_mask(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
Definition nvptxintrin.h:55
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
Definition nvptxintrin.h:90
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_lane(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_private(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_local(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
Definition nvptxintrin.h:35
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
Definition nvptxintrin.h:60
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
Definition nvptxintrin.h:70
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
Definition nvptxintrin.h:65
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
Definition nvptxintrin.h:95
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
Definition nvptxintrin.h:85
_Pragma("omp begin declare target device_type(nohost)")
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
Definition nvptxintrin.h:45
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
Definition nvptxintrin.h:50
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_exit(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_threads(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
Definition nvptxintrin.h:40
#define noreturn
Definition stdnoreturn.h:17