#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")
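// Note on the control word built above (a sketch of the encoding, per the PTX
// shfl operand format): the low bits of the third operand hold the lane-clamp
// value (__Mask) and bits 8..12 hold the segment mask derived from __width.
// Illustrative arithmetic only:
//
//   width = 16, idx/down/bfly shuffle (__Mask == 0x1f):
//     ((warpSize - width) << 8) | 0x1f  ==  (16 << 8) | 0x1f  ==  0x101f
//
// so each 16-lane segment of the warp shuffles independently of the other.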
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// __shfl_sync_* variants available in CUDA-9.
#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
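// Usage sketch (illustrative, not part of this header): a full-warp butterfly
// reduction built on the __shfl_xor_sync overloads defined above. Assumes all
// 32 lanes of the warp are active and participate.
//
//   __device__ int __example_warp_sum(int __v) {
//     for (int __d = 16; __d > 0; __d /= 2)
//       __v += __shfl_xor_sync(0xffffffffu, __v, __d);
//     return __v; // every lane ends up holding the warp-wide sum
//   }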
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  return __nvvm_activemask();
#endif
}

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}
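// Usage sketch (illustrative, not part of this header): __ballot_sync packs
// one bit per lane, so combining it with __popc counts how many lanes of the
// participating mask satisfy a predicate.
//
//   __device__ int __example_count_true(unsigned __mask, int __pred) {
//     return __popc(__ballot_sync(__mask, __pred));
//   }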
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned long long
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned long long
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
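// Usage sketch (illustrative, not part of this header): __match32_any_sync
// returns the set of lanes whose 32-bit value equals the calling lane's, which
// is a convenient way to group the lanes of a warp by key on sm_70+.
//
//   __device__ unsigned __example_peers_with_same_key(unsigned __key) {
//     return __match32_any_sync(__activemask(), __key);
//   }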
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0]; ret.y = rv[1]; ret.z = rv[2]; ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0]; ret.y = rv[1];
  return ret;
}
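// Usage sketch (illustrative, not part of this header): __ldg reads through
// the read-only data cache, so it is only safe for memory that no thread
// writes for the lifetime of the kernel.
//
//   __global__ void __example_copy(float4 *__dst, const float4 *__src) {
//     unsigned __i = blockIdx.x * blockDim.x + threadIdx.x;
//     __dst[__i] = __ldg(&__src[__i]);
//   }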
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned ret;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#if defined(__cplusplus) && (__cplusplus >= 201103L)

#pragma push_macro("__INTRINSIC_LOAD")
#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                         __Clobber) \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
    __TmpType __ret; \
    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \
    return (__DeclType)__ret; \
  }

#pragma push_macro("__INTRINSIC_LOAD2")
#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                          __Clobber) \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
    __DeclType __ret; \
    __TmpType __tmp; \
    asm(__AsmOp " {%0,%1}, [%2];" \
        : __AsmType(__tmp.x), __AsmType(__tmp.y) \
        : "l"(__ptr)__Clobber); \
    using __ElementType = decltype(__ret.x); \
    __ret.x = (__ElementType)(__tmp.x); \
    __ret.y = (__ElementType)__tmp.y; \
    return __ret; \
  }

#pragma push_macro("__INTRINSIC_LOAD4")
#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                          __Clobber) \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
    __DeclType __ret; \
    __TmpType __tmp; \
    asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \
        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \
          __AsmType(__tmp.w) \
        : "l"(__ptr)__Clobber); \
    using __ElementType = decltype(__ret.x); \
    __ret.x = (__ElementType)__tmp.x; \
    __ret.y = (__ElementType)__tmp.y; \
    __ret.z = (__ElementType)__tmp.z; \
    __ret.w = (__ElementType)__tmp.w; \
    return __ret; \
  }
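// The instantiations below wrap PTX cache operators: .cg caches in L2 only,
// .cs marks data as streaming (evict-first), .cv reads the data as volatile
// (fetch from memory again each time), and .wt stores write-through. As a
// rough illustration (not the literal preprocessor output), an instantiation
// such as
//
//   __INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
//
// expands to an overload along the lines of
//
//   inline __device__ int __ldcs(const int *__ptr) {
//     unsigned int __ret;
//     asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
//     return (int)__ret;
//   }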
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
                 "=l", );

__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2,
                  "=l", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
                 "=h", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
                 unsigned long long, "=l", );

__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
                  "=l", );

__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
inline __device__ long __ldcg(const long *__ptr) {
  long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return __ret;
}
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
                 "=h", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
                 unsigned long long, "=l", : "memory");

__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
                 "=h", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
                 "=l", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
                  "=h", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
                  "=h", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
                  "=l", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
                  "=h", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
                  "=h", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
                  "=l", : "memory");

__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
                  "=f", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
                  "=f", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
                  "=d", : "memory");
inline __device__ long __ldcv(const long *__ptr) {
  long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return __ret;
}
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
                 "=l", );

__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2,
                  "=l", );

__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
                 "=h", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
                 unsigned long long, "=l", );

__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
                  "=l", );

__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
#pragma pop_macro("__INTRINSIC_LOAD")
#pragma pop_macro("__INTRINSIC_LOAD2")
#pragma pop_macro("__INTRINSIC_LOAD4")
inline __device__ long __ldcs(const long *__ptr) {
  long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return __ret;
}
#pragma push_macro("__INTRINSIC_STORE")
#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
    __TmpType __tmp = (__TmpType)__value; \
    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \
  }

#pragma push_macro("__INTRINSIC_STORE2")
#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType, \
                           __AsmType) \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
    __TmpType __tmp; \
    using __ElementType = decltype(__tmp.x); \
    __tmp.x = (__ElementType)(__value.x); \
    __tmp.y = (__ElementType)(__value.y); \
    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \
        __AsmType(__tmp.y) \
        : "memory"); \
  }

#pragma push_macro("__INTRINSIC_STORE4")
#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType, \
                           __AsmType) \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
    __TmpType __tmp; \
    using __ElementType = decltype(__tmp.x); \
    __tmp.x = (__ElementType)(__value.x); \
    __tmp.y = (__ElementType)(__value.y); \
    __tmp.z = (__ElementType)(__value.z); \
    __tmp.w = (__ElementType)(__value.w); \
    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \
        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \
        : "memory"); \
  }
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");

__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");

__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
                  "h");
__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
                  unsigned long long, "l");

__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");

__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
#pragma pop_macro("__INTRINSIC_STORE")
#pragma pop_macro("__INTRINSIC_STORE2")
#pragma pop_macro("__INTRINSIC_STORE4")

#endif // defined(__cplusplus) && (__cplusplus >= 201103L)
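// Usage sketch (illustrative, not part of this header): pairing the streaming
// load with the write-through store keeps single-use data from displacing the
// cache's working set.
//
//   __global__ void __example_stream_copy(float *__dst, const float *__src,
//                                         unsigned __n) {
//     unsigned __i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (__i < __n)
//       __stwt(&__dst[__i], __ldcs(&__src[__i]));
//   }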
#if CUDA_VERSION >= 11000
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
#endif // CUDA_VERSION >= 11000
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
__device__ inline unsigned __reduce_add_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_add(__value, __mask);
}
__device__ inline unsigned __reduce_min_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_umin(__value, __mask);
}
__device__ inline unsigned __reduce_max_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_umax(__value, __mask);
}
__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
  return __nvvm_redux_sync_min(__value, __mask);
}
__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
  return __nvvm_redux_sync_max(__value, __mask);
}
__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
  return __nvvm_redux_sync_or(__value, __mask);
}
__device__ inline unsigned __reduce_and_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_and(__value, __mask);
}
__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_xor(__value, __mask);
}
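// Usage sketch (illustrative, not part of this header): on sm_80 and newer a
// single hardware reduction replaces the shuffle loop sketched earlier.
//
//   __device__ unsigned __example_warp_total(unsigned __v) {
//     return __reduce_add_sync(0xffffffffu, __v); // all 32 lanes participate
//   }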
__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
                                                         const void *__src,
                                                         unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_4(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
                                                         const void *__src,
                                                         unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_8(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
                                                          const void *__src,
                                                          unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_16(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}

__device__ inline void *
__nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
  // CUDA headers suggest that passing __ptr through unchanged does not affect
  // correctness; they do the same on pre-sm_80 GPUs where the corresponding
  // builtin is not available.
  return (void *)__ptr;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
__device__ inline unsigned __isCtaShared(const void *ptr) {
  return __isShared(ptr);
}

__device__ inline unsigned __isClusterShared(const void *__ptr) {
  return __nvvm_isspacep_shared_cluster(__ptr);
}

__device__ inline void *__cluster_map_shared_rank(const void *__ptr,
                                                  unsigned __rank) {
  return __nvvm_mapa((void *)__ptr, __rank);
}

__device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
  return __nvvm_getctarank((void *)__ptr);
}

__device__ inline uint2
__cluster_map_shared_multicast(const void *__ptr,
                               unsigned int __cluster_cta_mask) {
  return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
                    __cluster_cta_mask);
}

__device__ inline unsigned __clusterDimIsSpecified() {
  return __nvvm_is_explicit_cluster();
}

__device__ inline dim3 __clusterDim() {
  return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
              __nvvm_read_ptx_sreg_cluster_nctaid_y(),
              __nvvm_read_ptx_sreg_cluster_nctaid_z());
}

__device__ inline dim3 __clusterRelativeBlockIdx() {
  return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
              __nvvm_read_ptx_sreg_cluster_ctaid_y(),
              __nvvm_read_ptx_sreg_cluster_ctaid_z());
}

__device__ inline dim3 __clusterGridDimInClusters() {
  return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
              __nvvm_read_ptx_sreg_nclusterid_y(),
              __nvvm_read_ptx_sreg_nclusterid_z());
}

__device__ inline dim3 __clusterIdx() {
  return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
              __nvvm_read_ptx_sreg_clusterid_y(),
              __nvvm_read_ptx_sreg_clusterid_z());
}

__device__ inline unsigned __clusterRelativeBlockRank() {
  return __nvvm_read_ptx_sreg_cluster_ctarank();
}

__device__ inline unsigned __clusterSizeInBlocks() {
  return __nvvm_read_ptx_sreg_cluster_nctarank();
}

__device__ inline void __cluster_barrier_arrive() {
  __nvvm_barrier_cluster_arrive();
}

__device__ inline void __cluster_barrier_arrive_relaxed() {
  __nvvm_barrier_cluster_arrive_relaxed();
}

__device__ inline void __cluster_barrier_wait() {
  __nvvm_barrier_cluster_wait();
}

__device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }
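// Usage sketch (illustrative, not part of this header): the cluster barrier is
// split-phase, so a block can signal arrival, overlap independent work, and
// only then wait for the remaining blocks in the cluster.
//
//   __cluster_barrier_arrive();
//   // ... work that does not depend on the other blocks in the cluster ...
//   __cluster_barrier_wait();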
__device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__("atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
          : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z),
            "f"(__val.w));
  return __ret;
}

__device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__("atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
          : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z),
            "f"(__val.w));
  return __ret;
}

__device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__("atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
          : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z),
            "f"(__val.w));
  return __ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
#endif // defined(__CLANG_CUDA_INTRINSICS_H__)