clang 20.0.0git
nvptxintrin.h
Go to the documentation of this file.
1//===-- nvptxintrin.h - NVPTX intrinsic functions -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef __NVPTXINTRIN_H
10#define __NVPTXINTRIN_H
11
12#ifndef __NVPTX__
13#error "This file is intended for NVPTX targets or offloading to NVPTX"
14#endif
15
16#include <stdint.h>
17
18#if !defined(__cplusplus)
19_Pragma("push_macro(\"bool\")");
20#define bool _Bool
21#endif
22
23_Pragma("omp begin declare target device_type(nohost)");
24_Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
25
// Address-space qualifiers used by the NVPTX backend, listed by ascending
// address-space number. Each one expands to an attribute (not a type), so it
// is written alongside the pointee type it qualifies.
#define __gpu_generic __attribute__((address_space(0)))
#define __gpu_global __attribute__((address_space(1)))
#define __gpu_local __attribute__((address_space(3)))
#define __gpu_constant __attribute__((address_space(4)))
#define __gpu_private __attribute__((address_space(5)))

// Marks a function as a kernel entry point and gives it protected visibility.
#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
35
36// Returns the number of CUDA blocks in the 'x' dimension.
37_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
38 return __nvvm_read_ptx_sreg_nctaid_x();
39}
40
41// Returns the number of CUDA blocks in the 'y' dimension.
42_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
43 return __nvvm_read_ptx_sreg_nctaid_y();
44}
45
46// Returns the number of CUDA blocks in the 'z' dimension.
47_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
48 return __nvvm_read_ptx_sreg_nctaid_z();
49}
50
51// Returns the 'x' dimension of the current CUDA block's id.
52_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
53 return __nvvm_read_ptx_sreg_ctaid_x();
54}
55
56// Returns the 'y' dimension of the current CUDA block's id.
57_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
58 return __nvvm_read_ptx_sreg_ctaid_y();
59}
60
61// Returns the 'z' dimension of the current CUDA block's id.
62_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
63 return __nvvm_read_ptx_sreg_ctaid_z();
64}
65
66// Returns the number of CUDA threads in the 'x' dimension.
67_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
68 return __nvvm_read_ptx_sreg_ntid_x();
69}
70
71// Returns the number of CUDA threads in the 'y' dimension.
72_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
73 return __nvvm_read_ptx_sreg_ntid_y();
74}
75
76// Returns the number of CUDA threads in the 'z' dimension.
77_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
78 return __nvvm_read_ptx_sreg_ntid_z();
79}
80
81// Returns the 'x' dimension id of the thread in the current CUDA block.
82_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
83 return __nvvm_read_ptx_sreg_tid_x();
84}
85
86// Returns the 'y' dimension id of the thread in the current CUDA block.
87_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
88 return __nvvm_read_ptx_sreg_tid_y();
89}
90
91// Returns the 'z' dimension id of the thread in the current CUDA block.
92_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
93 return __nvvm_read_ptx_sreg_tid_z();
94}
95
96// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
97_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
98 return __nvvm_read_ptx_sreg_warpsize();
99}
100
101// Returns the id of the thread inside of a CUDA warp executing together.
102_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
103 return __nvvm_read_ptx_sreg_laneid();
104}
105
106// Returns the bit-mask of active threads in the current warp.
107_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
108 return __nvvm_activemask();
109}
110
111// Copies the value from the first active thread in the warp to the rest.
112_DEFAULT_FN_ATTRS static __inline__ uint32_t
113__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
114 uint32_t __mask = (uint32_t)__lane_mask;
115 uint32_t __id = __builtin_ffs(__mask) - 1;
116 return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
117}
118
119// Copies the value from the first active thread in the warp to the rest.
120_DEFAULT_FN_ATTRS static __inline__ uint64_t
121__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
122 uint32_t __hi = (uint32_t)(__x >> 32ull);
123 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
124 uint32_t __mask = (uint32_t)__lane_mask;
125 uint32_t __id = __builtin_ffs(__mask) - 1;
126 return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
127 __gpu_num_lanes() - 1)
128 << 32ull) |
129 ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
130 __gpu_num_lanes() - 1));
131}
132
133// Returns a bitmask of threads in the current lane for which \p x is true.
134_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
135 bool __x) {
136 uint32_t __mask = (uint32_t)__lane_mask;
137 return __nvvm_vote_ballot_sync(__mask, __x);
138}
139
140// Waits for all the threads in the block to converge and issues a fence.
141_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
142 __syncthreads();
143}
144
145// Waits for all threads in the warp to reconverge for independent scheduling.
146_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
147 __nvvm_bar_warp_sync((uint32_t)__lane_mask);
148}
149
150// Shuffles the the lanes inside the warp according to the given index.
151_DEFAULT_FN_ATTRS static __inline__ uint32_t
152__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
153 uint32_t __mask = (uint32_t)__lane_mask;
154 uint32_t __bitmask = (__mask >> __idx) & 1u;
155 return -__bitmask &
156 __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
157}
158
159// Shuffles the the lanes inside the warp according to the given index.
160_DEFAULT_FN_ATTRS static __inline__ uint64_t
161__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
162 uint32_t __hi = (uint32_t)(__x >> 32ull);
163 uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
164 uint32_t __mask = (uint32_t)__lane_mask;
165 uint64_t __bitmask = (__mask >> __idx) & 1u;
166 return -__bitmask & ((uint64_t)__nvvm_shfl_sync_idx_i32(
167 __mask, __hi, __idx, __gpu_num_lanes() - 1u)
168 << 32ull) |
169 ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
170 __gpu_num_lanes() - 1u));
171}
172
173// Returns true if the flat pointer points to CUDA 'shared' memory.
174_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
175 return __nvvm_isspacep_shared(ptr);
176}
177
178// Returns true if the flat pointer points to CUDA 'local' memory.
179_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
180 return __nvvm_isspacep_local(ptr);
181}
182
183// Terminates execution of the calling thread.
184_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
185 __nvvm_exit();
186}
187
188// Suspend the thread briefly to assist the scheduler during busy loops.
189_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
190 if (__nvvm_reflect("__CUDA_ARCH") >= 700)
191 asm("nanosleep.u32 64;" ::: "memory");
192}
193
194_Pragma("omp end declare variant");
195_Pragma("omp end declare target");
196
197#if !defined(__cplusplus)
198_Pragma("pop_macro(\"bool\")");
199#endif
200
201#endif // __NVPTXINTRIN_H
#define _DEFAULT_FN_ATTRS
Definition: enqcmdintrin.h:18
unsigned long uint64_t
unsigned int uint32_t
static _DEFAULT_FN_ATTRS __inline__ void __gpu_thread_suspend(void)
Definition: nvptxintrin.h:189
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
Definition: nvptxintrin.h:82
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
Definition: nvptxintrin.h:102
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
Definition: nvptxintrin.h:77
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_lane_mask(void)
Definition: nvptxintrin.h:107
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x)
Definition: nvptxintrin.h:152
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
Definition: nvptxintrin.h:113
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
Definition: nvptxintrin.h:121
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
Definition: nvptxintrin.h:57
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
Definition: nvptxintrin.h:92
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_lane(uint64_t __lane_mask)
Definition: nvptxintrin.h:146
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_private(void *ptr)
Definition: nvptxintrin.h:179
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_local(void *ptr)
Definition: nvptxintrin.h:174
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
Definition: nvptxintrin.h:37
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
Definition: nvptxintrin.h:62
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
Definition: nvptxintrin.h:72
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x)
Definition: nvptxintrin.h:161
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
Definition: nvptxintrin.h:67
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
Definition: nvptxintrin.h:97
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
Definition: nvptxintrin.h:87
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
Definition: nvptxintrin.h:47
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
Definition: nvptxintrin.h:52
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
Definition: nvptxintrin.h:134
static _DEFAULT_FN_ATTRS __inline__ void __gpu_exit(void)
Definition: nvptxintrin.h:184
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_threads(void)
Definition: nvptxintrin.h:141
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
Definition: nvptxintrin.h:42
_Pragma("push_macro(\"bool\")")
#define noreturn
Definition: stdnoreturn.h:17