__clang_cuda_texture_intrinsics.h
1/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 *
9 * This header provides in-header implementations for NVCC's built-in
10 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
11 * built-in is unusual as it's actually a set of function overloads that use the
12 * first string literal argument as one of the overload parameters.
13 */
14#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
15#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
16#ifndef __CUDA__
17#error "This file is for CUDA compilation only."
18#endif
19
20// __nv_tex_surf_handler() is provided by this header as a macro.
21#define __nv_tex_surf_handler(__op, __ptr, ...) \
22 ::__cuda_tex::__tex_fetch< \
23 ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
24 __VA_ARGS__)
25
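// Example (for illustration only; not part of the original header): CUDA's
// texture headers call __nv_tex_surf_handler() with the operation name as the
// first argument. A hypothetical wrapper would look like
//
//   __device__ float4 __my_tex2D(cudaTextureObject_t __obj, float __x,
//                                float __y) {
//     float4 __ret;
//     __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
//     return __ret;
//   }
//
// With the macro above, the string literal is hashed at compile time, so the
// call becomes ::__cuda_tex::__tex_fetch<::__cuda_tex::__Tag<...>>(&__ret,
// __obj, __x, __y) and the "overload on a string literal" behaves like an
// ordinary template dispatch.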
26#pragma push_macro("__ASM_OUT")
27#pragma push_macro("__ASM_OUTP")
28#pragma push_macro("__Args")
29#pragma push_macro("__ID")
30#pragma push_macro("__IDV")
31#pragma push_macro("__OP_TYPE_SURFACE")
32#pragma push_macro("__IMPL_2DGATHER")
33#pragma push_macro("__IMPL_ALIAS")
34#pragma push_macro("__IMPL_ALIASI")
35#pragma push_macro("__IMPL_F1")
36#pragma push_macro("__IMPL_F3")
37#pragma push_macro("__IMPL_F3N")
38#pragma push_macro("__IMPL_F3S")
39#pragma push_macro("__IMPL_S")
40#pragma push_macro("__IMPL_S3")
41#pragma push_macro("__IMPL_S3I")
42#pragma push_macro("__IMPL_S3N")
43#pragma push_macro("__IMPL_S3NI")
44#pragma push_macro("__IMPL_S3S")
45#pragma push_macro("__IMPL_S3SI")
46#pragma push_macro("__IMPL_SI")
47#pragma push_macro("__L")
48#pragma push_macro("__STRIP_PARENS")
49#pragma push_macro("__SURF_WRITE_V2")
50#pragma push_macro("__SW_ASM_ARGS")
51#pragma push_macro("__SW_ASM_ARGS1")
52#pragma push_macro("__SW_ASM_ARGS2")
53#pragma push_macro("__SW_ASM_ARGS4")
54#pragma push_macro("__SURF_WRITE_V2")
55#pragma push_macro("__SURF_READ_V2")
56#pragma push_macro("__SW_ASM_ARGS")
57#pragma push_macro("__SW_ASM_ARGS1")
58#pragma push_macro("__SW_ASM_ARGS2")
59#pragma push_macro("__SW_ASM_ARGS4")
60#pragma push_macro("__SURF_READ1D");
61#pragma push_macro("__SURF_READ2D");
62#pragma push_macro("__SURF_READ3D");
63#pragma push_macro("__SURF_READ1DLAYERED");
64#pragma push_macro("__SURF_READ2DLAYERED");
65#pragma push_macro("__SURF_READCUBEMAP");
66#pragma push_macro("__SURF_READCUBEMAPLAYERED");
67#pragma push_macro("__1DV1");
68#pragma push_macro("__1DV2");
69#pragma push_macro("__1DV4");
70#pragma push_macro("__2DV1");
71#pragma push_macro("__2DV2");
72#pragma push_macro("__2DV4");
73#pragma push_macro("__1DLAYERV1");
74#pragma push_macro("__1DLAYERV2");
75#pragma push_macro("__1DLAYERV4");
76#pragma push_macro("__3DV1");
77#pragma push_macro("__3DV2");
78#pragma push_macro("__3DV4");
79#pragma push_macro("__2DLAYERV1");
80#pragma push_macro("__2DLAYERV2");
81#pragma push_macro("__2DLAYERV4");
82#pragma push_macro("__CUBEMAPV1");
83#pragma push_macro("__CUBEMAPV2");
84#pragma push_macro("__CUBEMAPV4");
85#pragma push_macro("__CUBEMAPLAYERV1");
86#pragma push_macro("__CUBEMAPLAYERV2");
87#pragma push_macro("__CUBEMAPLAYERV4");
88#pragma push_macro("__SURF_READXD_ALL");
89#pragma push_macro("__SURF_WRITE1D_V2");
90#pragma push_macro("__SURF_WRITE1DLAYERED_V2");
91#pragma push_macro("__SURF_WRITE2D_V2");
92#pragma push_macro("__SURF_WRITE2DLAYERED_V2");
93#pragma push_macro("__SURF_WRITE3D_V2");
94#pragma push_macro("__SURF_CUBEMAPWRITE_V2");
95#pragma push_macro("__SURF_CUBEMAPLAYEREDWRITE_V2");
96#pragma push_macro("__SURF_WRITEXD_V2_ALL");
97#pragma push_macro("__1DV1");
98#pragma push_macro("__1DV2");
99#pragma push_macro("__1DV4");
100#pragma push_macro("__2DV1");
101#pragma push_macro("__2DV2");
102#pragma push_macro("__2DV4");
103#pragma push_macro("__3DV1");
104#pragma push_macro("__3DV2");
105#pragma push_macro("__3DV4");
106
107// Put all functions into an anonymous namespace so they have internal linkage.
108// The device-only functions here must be internal in order to avoid ODR
109// violations in case they are used from files compiled with -fgpu-rdc.
110// E.g. a library and an app using it may be built with different versions of
111// this header file.
112namespace {
113
114// Put the implementation into its own namespace so we don't pollute the TU.
115namespace __cuda_tex {
116
117// First, we need a perfect hash function and a few constexpr helper functions
118// for converting a string literal into a numeric value which can be used to
119// parametrize a template. We cannot use string literals for that, as that
120// would require C++20.
121//
122// The hash function was generated with 'gperf' and then manually converted into
123// its constexpr equivalent.
124//
125// NOTE: the perfect hashing scheme comes with an inherent self-test. If the hash
126// function has a collision for any of the texture operations, the compilation
127// will fail due to an attempt to redefine a tag with the same value. If the
128// header compiles, then the hash function is good enough for the job.
129
130constexpr int __tex_len(const char *s) {
131 return (s[0] == 0) ? 0
132 : (s[1] == 0) ? 1
133 : (s[2] == 0) ? 2
134 : (s[3] == 0) ? 3
135 : (s[4] == 0) ? 4
136 : (s[5] == 0) ? 5
137 : (s[6] == 0) ? 6
138 : (s[7] == 0) ? 7
139 : (s[8] == 0) ? 8
140 : (s[9] == 0) ? 9
141 : (s[10] == 0) ? 10
142 : (s[11] == 0) ? 11
143 : (s[12] == 0) ? 12
144 : (s[13] == 0) ? 13
145 : (s[14] == 0) ? 14
146 : (s[15] == 0) ? 15
147 : (s[16] == 0) ? 16
148 : (s[17] == 0) ? 17
149 : (s[18] == 0) ? 18
150 : (s[19] == 0) ? 19
151 : (s[20] == 0) ? 20
152 : (s[21] == 0) ? 21
153 : (s[22] == 0) ? 22
154 : (s[23] == 0) ? 23
155 : (s[24] == 0) ? 24
156 : (s[25] == 0) ? 25
157 : (s[26] == 0) ? 26
158 : (s[27] == 0) ? 27
159 : (s[28] == 0) ? 28
160 : (s[29] == 0) ? 29
161 : (s[30] == 0) ? 30
162 : (s[31] == 0) ? 31
163 : 32;
164}
165
166constexpr int __tex_hash_map(int c) {
167 return (c == 49) ? 10
168 : (c == 50) ? 0
169 : (c == 51) ? 100
170 : (c == 52) ? 30
171 : (c == 67) ? 10
172 : (c == 68) ? 0
173 : (c == 69) ? 25
174 : (c == 72) ? 70
175 : (c == 77) ? 0
176 : (c == 96) ? 44
177 : (c == 99) ? 10
178 : (c == 100) ? 5
179 : (c == 101) ? 60
180 : (c == 102) ? 40
181 : (c == 103) ? 70
182 : (c == 104) ? 25
183 : (c == 112) ? 0
184 : (c == 114) ? 45
185 : (c == 117) ? 5
186 : (c == 118) ? 85
187 : (c == 120) ? 20
188 : 225;
189}
190
191constexpr int __tex_op_hash(const char *str) {
192 return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
193 __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
194}
195
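// For illustration: the hash is usable in constant expressions, so distinct
// operation names map to distinct tag types. A sanity check of the
// perfect-hash property described above (editor's example, not part of the
// original header):
static_assert(__tex_op_hash("__tex2D_v2") != __tex_op_hash("__tex3D_v2"),
              "texture op hashes must not collide");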
196// Tag type to identify a particular texture operation.
197template <int N> struct __Tag;
198#define __ID(__op) __Tag<__tex_op_hash(__op)>
199// Tags for variants of a particular operation. E.g. tex2Dgather can translate
200// into 4 different instructions.
201#define __IDV(__op, __variant) \
202 __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
203
204// Helper classes for figuring out key data types for derived types.
205// E.g. char2 has __base_t = char, __fetch_t = int4
206template <class> struct __TypeInfoT;
207// Type info for the fundamental types.
208template <> struct __TypeInfoT<float> {
209 using __base_t = float;
210 using __fetch_t = float4;
211};
212template <> struct __TypeInfoT<char> {
213 using __base_t = char;
214 using __fetch_t = int4;
215};
216template <> struct __TypeInfoT<signed char> {
217 using __base_t = signed char;
218 using __fetch_t = int4;
219};
220template <> struct __TypeInfoT<unsigned char> {
221 using __base_t = unsigned char;
222 using __fetch_t = uint4;
223};
224template <> struct __TypeInfoT<short> {
225 using __base_t = short;
226 using __fetch_t = int4;
227};
228template <> struct __TypeInfoT<unsigned short> {
229 using __base_t = unsigned short;
230 using __fetch_t = uint4;
231};
232template <> struct __TypeInfoT<int> {
233 using __base_t = int;
234 using __fetch_t = int4;
235};
236template <> struct __TypeInfoT<unsigned int> {
237 using __base_t = unsigned int;
238 using __fetch_t = uint4;
239};
240
241// Derived base/fetch types for N-element vectors.
242template <class __T> struct __TypeInfoT {
243 using __base_t = decltype(__T::x);
244 using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
245};
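// For illustration: for a vector type, the base type is taken from its .x
// member and the fetch type is inherited from that base type, e.g.
//   __TypeInfoT<float2>::__base_t  is float
//   __TypeInfoT<float2>::__fetch_t is float4
// so a float2 result is produced from a float4 texture fetch and narrowed
// later by __convert (defined further below).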
246
247// Tag structs to distinguish operation types
248struct __texture_op_tag {};
249struct __surface_op_tag {};
250
251// Template specialization to determine operation type based on tag value
252template <class __op> struct __op_type_traits {
253 using type = __texture_op_tag;
254};
255
256// Specialize for known surface operation tags
257#define __OP_TYPE_SURFACE(__op) \
258 template <> struct __op_type_traits<__op> { \
259 using type = __surface_op_tag; \
260 }
261
262// Classes that implement specific texture ops.
263template <class __op> struct __tex_fetch_v4;
264
265// Helper macros to strip parens from a macro argument.
266#define __Args(...) __VA_ARGS__
267#define __STRIP_PARENS(__X) __X
268#define __L(__X) __STRIP_PARENS(__Args __X)
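// For illustration: __L strips one level of parentheses so that a single macro
// parameter can carry a comma-separated list, e.g.
//   __L((float __x, float __y))  ->  float __x, float __y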
269
270// Construct inline assembly output args.
271// Results are stored in a temp var __r.
272// The isResident bool is pointed to by __ir.
273// Asm args for return values: a 4-element vector.
274#define __ASM_OUT(__t) \
275 ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
276// .. possibly combined with a predicate.
277#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
278
279// Implements a single variant of texture fetch instruction.
280#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
281 template <> \
282 __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
283 __rt __r; \
284 asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
285 return __r; \
286 }
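// For illustration, a sketch of what one instantiation expands to (the float4
// case of the "__tex2D_v2" entry defined further below, inside the
// corresponding __tex_fetch_v4 specialization):
//   template <>
//   __device__ float4 __run<float4>(cudaTextureObject_t __obj, float __x,
//                                   float __y) {
//     float4 __r;
//     asm("tex.2d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
//         : "=f"(__r.x), "=f"(__r.y), "=f"(__r.z), "=f"(__r.w)
//         : "l"(__obj), "f"(__x), "f"(__y));
//     return __r;
//   }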
287
288// Implements texture fetch instructions for int4/uint4/float4 data types.
289#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
290 __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
291 __ASM_OUT("r"), __asm_args) \
292 __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
293 __ASM_OUT("r"), __asm_args) \
294 __IMPL_F1(float4, float4, __args, \
295 __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
296 __asm_args)
297// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
298// types. Similar to above, but returns a boolean 'isPresent' value in addition
299// to texture data.
300#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
301 __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
302 __ASM_OUTP("r"), __asm_args) \
303 __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
304 __ASM_OUTP("r"), __asm_args) \
305 __IMPL_F1(float4, float4, __args, \
306 __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
307 __asm_args)
308
309// Similar to F3, but for integer data which is returned as normalized floats.
310// Only instantiates fetch functions for int4/uint4.
311#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
312 __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
313 __ASM_OUT("r"), __asm_args) \
314 __IMPL_F1(float4, uint4, __args, \
315 __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
316 __asm_args)
317
318// Instantiates __tex_fetch_v4 with regular fetch functions.
319#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
320 template <> struct __tex_fetch_v4<__op> { \
321 template <class T> \
322 __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
323 __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
324 }
325
326// Same, but for sparse ops. Only available on sm_60+
327#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
328#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
329 __asm_args) \
330 template <> struct __tex_fetch_v4<__op> { \
331 template <class T> \
332 __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
333 __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
334 }
335#else
336#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
337#endif
338
339// Same, but for normalized float ops.
340#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
341 __asm_args) \
342 template <> struct __tex_fetch_v4<__op> { \
343 template <class T> \
344 __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
345 __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
346 }
347
348// Regular and normalized float ops share a lot of similarities. This macro
349// instantiates both variants -- normal for __op and normalized for __opn.
350#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
351 __asm_args) \
352 __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
353 __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
354
355// Convenience macros which convert a string literal __op into a __Tag.
356#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
357 __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
358#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
359 __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
360#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
361 __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
362#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
363 __asm_args) \
364 __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
365 __asm_args)
366
367// CUDA headers have some 'legacy' texture operations that duplicate
368// functionality. So, we just inherit it instead of redefining a copy.
369#define __IMPL_ALIASI(__op, __opn) \
370 template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
371#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
372
373// Now we can instantiate everything we need for each specific texture fetch
374// variant.
375__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
376 "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
377__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
378 "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
379__IMPL_ALIAS("__itex1D", "__tex1D_v2");
380__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
381
382__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
383 (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
384 "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
385 ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
386__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
387
388__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
389 (float __x, int __layer), "tex.a1d.v4", "f32",
390 "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
391__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
392
393__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
394 (float __x, int __layer, float __dPdx, float __dPdy),
395 "tex.grad.a1d.v4", "f32",
396 "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
397 ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
398__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
399
400__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
401 (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
402 "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
403 ("r"(__layer), "f"(__x), "f"(__level)));
404__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
405
406__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
407 "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
408 ("f"(__x), "f"(__level)));
409__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
410
411// 2D
412__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
413 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
414__IMPL_ALIAS("__itex2D", "__tex2D_v2");
415
416__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
417 "{.reg .pred %%p0;\n\t"
418 "tex.2d.v4",
419 "f32",
420 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
421 " selp.u16 %4, 1, 0, %%p0; }",
422 ("f"(__x), "f"(__y)));
423
424__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
425 (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
426 "tex.grad.2d.v4", "f32",
427 "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
428 ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
429 "f"(__dPdy->y)));
430__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
431
432__IMPL_S3S("__itex2DGrad_sparse",
433 (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
434 unsigned char *__ir),
435 "{.reg .pred %%p0;\n\t"
436 "tex.grad.2d.v4",
437 "f32",
438 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
439 "selp.u16 %4, 1, 0, %%p0; }",
440 ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
441 "f"(__dPdy->y)));
442
443__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
444 (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
445 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
446 ("r"(__layer), "f"(__x), "f"(__y)));
447__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
448
449__IMPL_S3S("__itex2DLayered_sparse",
450 (float __x, float __y, int __layer, unsigned char *__ir),
451 "{.reg .pred %%p0;\n\t"
452 "tex.a2d.v4",
453 "f32",
454 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
455 "selp.u16 %4, 1, 0, %%p0; }",
456 ("r"(__layer), "f"(__x), "f"(__y)));
457
458__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
459 (float __x, float __y, int __layer, const float2 *__dPdx,
460 const float2 *__dPdy),
461 "tex.grad.a2d.v4", "f32",
462 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
463 ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
464 "f"(__dPdy->x), "f"(__dPdy->y)));
465__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
466
467__IMPL_S3S(
468    "__itex2DLayeredGrad_sparse",
469 (float __x, float __y, int __layer, const float2 *__dPdx,
470 const float2 *__dPdy, unsigned char *__ir),
471 "{.reg .pred %%p0;\n\t"
472 "tex.grad.a2d.v4",
473 "f32",
474 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
475 "selp.u16 %4, 1, 0, %%p0; }",
476 ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
477 "f"(__dPdy->x), "f"(__dPdy->y)));
478
479__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
480 (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
481 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
482 ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
483__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
484
485__IMPL_S3S("__itex2DLayeredLod_sparse",
486 (float __x, float __y, int __layer, float __level,
487 unsigned char *__ir),
488 "{.reg .pred %%p0;\n\t"
489 "tex.level.a2d.v4",
490 "f32",
491 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
492 "selp.u16 %4, 1, 0, %%p0; }",
493 ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
494
495__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
496 (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
497 "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
498 ("f"(__x), "f"(__y), "f"(__level)));
499__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
500
501__IMPL_S3S("__itex2DLod_sparse",
502 (float __x, float __y, float __level, unsigned char *__ir),
503 "{.reg .pred %%p0;\n\t"
504 "tex.level.2d.v4",
505 "f32",
506 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
507 "selp.u16 %4, 1, 0, %%p0; }",
508 ("f"(__x), "f"(__y), "f"(__level)));
509
510// 2D gather is special. Unlike other variants that translate into exactly one
511// asm instruction, it uses one of the four different instructions selected by
512// __comp. We implement each instruction variant separately, and dispatch the
513// right one from the manually implemented 'umbrella' fetch.
514#define __IMPL_2DGATHER(variant, instr) \
515 __IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
516 __IDV("__tex2Dgather_rmnf_v2", variant), \
517 (float __x, float __y, int __comp), instr, "f32", \
518 "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
519 __IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
520 __IDV("__tex2Dgather_v2", variant)); \
521 __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
522 (float __x, float __y, unsigned char *__ir, int __comp), \
523 "{.reg .pred %%p0;\n\t" instr, "f32", \
524 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \
525 "selp.u16 %4, 1, 0, %%p0; }", \
526 ("f"(__x), "f"(__y)));
527__IMPL_2DGATHER(0, "tld4.r.2d.v4");
528__IMPL_2DGATHER(1, "tld4.g.2d.v4");
529__IMPL_2DGATHER(2, "tld4.b.2d.v4");
530__IMPL_2DGATHER(3, "tld4.a.2d.v4");
531
532// Umbrella dispatcher -- calls into specific 2Dgather variant.
533template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
534 template <class __T>
535 __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
536 int __comp) {
537 switch (__comp) {
538 case 0:
539 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
540 __obj, __x, __y, __comp);
541 case 1:
542 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
543 __obj, __x, __y, __comp);
544 case 2:
545 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
546 __obj, __x, __y, __comp);
547 case 3:
548 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
549 __obj, __x, __y, __comp);
550 }
551 }
552};
553__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
554
555template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
556 template <class __T>
557 __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
558 float __y, int __comp) {
559 switch (__comp) {
560 case 0:
561 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
562 __obj, __x, __y, __comp);
563 case 1:
564 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
565 __obj, __x, __y, __comp);
566 case 2:
567 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
568 __obj, __x, __y, __comp);
569 case 3:
570 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
571 __obj, __x, __y, __comp);
572 }
573 }
574};
575
576#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
577template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
578 template <class __T>
579 __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
580 unsigned char *__ir, int __comp) {
581 switch (__comp) {
582 case 0:
583 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
584 __obj, __x, __y, __ir, __comp);
585 case 1:
586 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
587 __obj, __x, __y, __ir, __comp);
588 case 2:
589 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
590 __obj, __x, __y, __ir, __comp);
591 case 3:
592 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
593 __obj, __x, __y, __ir, __comp);
594 }
595 }
596};
597#endif
598
599// 3D
600__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
601 "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
602 ("f"(__x), "f"(__y), "f"(__z)));
603__IMPL_ALIAS("__itex3D", "__tex3D_v2");
604
605__IMPL_S3S("__itex3D_sparse",
606 (float __x, float __y, float __z, unsigned char *__ir),
607 "{.reg .pred %%p0;\n\t"
608 "tex.3d.v4",
609 "f32",
610 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
611 "selp.u16 %4, 1, 0, %%p0; }",
612 ("f"(__x), "f"(__y), "f"(__z)));
613
614__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
615 (float __x, float __y, float __z, const float4 *__dPdx,
616 const float4 *__dPdy),
617 "tex.grad.3d.v4", "f32",
618 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
619 "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
620 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
621 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
622__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
623
624__IMPL_S3S("__itex3DGrad_sparse",
625 (float __x, float __y, float __z, const float4 *__dPdx,
626 const float4 *__dPdy, unsigned char *__ir),
627 "{.reg .pred %%p0;\n\t"
628 "tex.grad.3d.v4",
629 "f32",
630 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
631 "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
632 "selp.u16 %4, 1, 0, %%p0; }",
633 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
634 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
635
636__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
637 (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
638 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
639 ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
640__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
641
642__IMPL_S3S("__itex3DLod_sparse",
643 (float __x, float __y, float __z, float __level,
644 unsigned char *__ir),
645 "{.reg .pred %%p0;\n\t"
646 "tex.level.3d.v4",
647 "f32",
648 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
649 "selp.u16 %4, 1, 0, %%p0; }",
650 ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
651
652// Cubemap
653__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
654 (float __x, float __y, float __z), "tex.cube.v4", "f32",
655 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
656 ("f"(__x), "f"(__y), "f"(__z)));
657__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
658
659__IMPL_S3S("__itexCubemap_sparse",
660 (float __x, float __y, float __z, unsigned char *__ir),
661 "{.reg .pred %%p0;\n\t"
662 "tex.cube.v4",
663 "f32",
664 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
665 "selp.u16 %4, 1, 0, %%p0; }",
666 ("f"(__x), "f"(__y), "f"(__z)));
667
668__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
669 (float __x, float __y, float __z, const float4 *__dPdx,
670 const float4 *__dPdy),
671 "tex.grad.cube.v4", "f32",
672 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
673 "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
674 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
675 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
676__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
677
678__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
679 (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
680 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
681 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
682__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
683
684__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
685 (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
686 const float4 *__dPdy),
687 "tex.grad.acube.v4", "f32",
688 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
689 "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
690 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
691 "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
692 "f"(__dPdy->z)));
693__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
694
695__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
696 (float __x, float __y, float __z, int __layer, float __level),
697 "tex.level.acube.v4", "f32",
698 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
699 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
700__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
701
702__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
703 (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
704 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
705 ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
706__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
707
708// Helper class for extracting slice of data from V4 fetch results.
709template <class __DestT, class __SrcT> struct __convert {
710 template <int __NElements = sizeof(__DestT) /
711 sizeof(typename __TypeInfoT<__DestT>::__base_t)>
712 __device__ static __DestT __run(__SrcT __v);
713 template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
714 template <> __device__ static __DestT __run<2>(__SrcT __v) {
715 return {__v.x, __v.y};
716 }
717 template <> __device__ static __DestT __run<3>(__SrcT __v) {
718 return {__v.x, __v.y, __v.z};
719 }
720 template <> __device__ static __DestT __run<4>(__SrcT __v) {
721 return {__v.x, __v.y, __v.z, __v.w};
722 }
723};
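// For illustration: __convert<__DestT, __SrcT> keeps only as many components
// as __DestT has, e.g. __convert<float2, float4>::__run(__v) returns
// {__v.x, __v.y}, while __convert<char4, int4>::__run(__v) builds a char4 from
// all four components of the int4 fetch result.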
724
725// There are a couple of layers here. First, __op_type_traits is used to
726// dispatch to either surface write calls, or to the texture read calls.
727//
728// Then, that dispatches to __tex_fetch_impl below, which dispatches by both tag
729// and datatype to the appropriate
730// __surf_read_write_v2.
731// TODO(austin): Do the reads too.
732
733// Mark which of the ids should be dispatched to the surface write calls.
734__OP_TYPE_SURFACE(__ID("__isurf1Dread"));
735__OP_TYPE_SURFACE(__ID("__isurf2Dread"));
736__OP_TYPE_SURFACE(__ID("__isurf3Dread"));
737__OP_TYPE_SURFACE(__ID("__isurf1DLayeredread"));
738__OP_TYPE_SURFACE(__ID("__isurf2DLayeredread"));
739__OP_TYPE_SURFACE(__ID("__isurfCubemapread"));
740__OP_TYPE_SURFACE(__ID("__isurfCubemapLayeredread"));
741__OP_TYPE_SURFACE(__ID("__isurf1Dwrite_v2"));
742__OP_TYPE_SURFACE(__ID("__isurf2Dwrite_v2"));
743__OP_TYPE_SURFACE(__ID("__isurf3Dwrite_v2"));
744__OP_TYPE_SURFACE(__ID("__isurf1DLayeredwrite_v2"));
745__OP_TYPE_SURFACE(__ID("__isurf2DLayeredwrite_v2"));
746__OP_TYPE_SURFACE(__ID("__isurfCubemapwrite_v2"));
747__OP_TYPE_SURFACE(__ID("__isurfCubemapLayeredwrite_v2"));
748
749template <class __op, typename __type> struct __surf_read_write_v2;
750
751// For the various write calls, we need to be able to generate variations with
752// different IDs, different numbers of arguments, and different numbers of
753// outputs.
754
755#define __SURF_WRITE_V2(__op, __asm_dim, __asmtype, __type, __index_op_args, \
756 __index_args, __index_asm_args, __asm_op_args, \
757 __asm_args) \
758 template <> struct __surf_read_write_v2<__op, __type> { \
759 static __device__ void __run(__type *__ptr, cudaSurfaceObject_t obj, \
760 __L(__index_args), \
761 cudaSurfaceBoundaryMode mode) { \
762 switch (mode) { \
763 case cudaBoundaryModeZero: \
764 asm volatile("sust.b." __asm_dim "." __asmtype \
765 ".zero [%0, " __index_op_args "], " __asm_op_args ";" \
766 : \
767 : "l"(obj), __L(__index_asm_args), __L(__asm_args)); \
768 break; \
769 case cudaBoundaryModeClamp: \
770 asm volatile("sust.b." __asm_dim "." __asmtype \
771 ".clamp [%0, " __index_op_args "], " __asm_op_args ";" \
772 : \
773 : "l"(obj), __L(__index_asm_args), __L(__asm_args)); \
774 break; \
775 case cudaBoundaryModeTrap: \
776 asm volatile("sust.b." __asm_dim "." __asmtype \
777 ".trap [%0, " __index_op_args "], " __asm_op_args ";" \
778 : \
779 : "l"(obj), __L(__index_asm_args), __L(__asm_args)); \
780 break; \
781 } \
782 } \
783 }
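// For illustration: each specialization generated by __SURF_WRITE_V2 switches
// on cudaSurfaceBoundaryMode and emits the matching PTX store. E.g. for a 2D
// int write with cudaBoundaryModeClamp, the inline asm is roughly
//   sust.b.2d.b32.clamp [%0, {%1, %2}], {%3};
// with %0 = the surface object, %1/%2 = x/y, and %3 = the value to store.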
784
785#define __SURF_READ_V2(__op, __asm_dim, __asmtype, __type, __asm_op_args, \
786 __asm_args, __index_args, __index_asm_args) \
787 template <> struct __surf_read_write_v2<__op, __type> { \
788 static __device__ void __run(__type *__ptr, cudaSurfaceObject_t obj, \
789 __L(__index_args), \
790 cudaSurfaceBoundaryMode mode) { \
791 switch (mode) { \
792 case cudaBoundaryModeZero: \
793 asm("suld.b." __asm_dim "." __asmtype ".zero " __asm_op_args ";" \
794 : __L(__asm_args) \
795 : "l"(obj), __L(__index_asm_args)); \
796 break; \
797 case cudaBoundaryModeClamp: \
798 asm("suld.b." __asm_dim "." __asmtype ".clamp " __asm_op_args ";" \
799 : __L(__asm_args) \
800 : "l"(obj), __L(__index_asm_args)); \
801 break; \
802 case cudaBoundaryModeTrap: \
803 asm("suld.b." __asm_dim "." __asmtype ".trap " __asm_op_args ";" \
804 : __L(__asm_args) \
805 : "l"(obj), __L(__index_asm_args)); \
806 break; \
807 } \
808 } \
809 }
810
811// The read side follows the same flow as the writes; only the generated
812// assembly calls differ, and the rest falls in line.
813
814#define __SW_ASM_ARGS(__type) (__type(*__ptr))
815#define __SW_ASM_ARGS1(__type) (__type(__ptr->x))
816#define __SW_ASM_ARGS2(__type) (__type(__ptr->x), __type(__ptr->y))
817#define __SW_ASM_ARGS4(__type) \
818 (__type(__ptr->x), __type(__ptr->y), __type(__ptr->z), __type(__ptr->w))
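// For illustration: these wrap the components of the value pointed to by
// __ptr as asm operands, e.g.
//   __SW_ASM_ARGS4("r")  ->  ("r"(__ptr->x), "r"(__ptr->y), "r"(__ptr->z), "r"(__ptr->w))
//   __SW_ASM_ARGS("=h")  ->  ("=h"(*__ptr))
// The write variants use plain input constraints ("r"), while the read
// variants use output constraints ("=r").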
819
820#define __SURF_READ1D(__asmtype, __type, __asm_op_args, __asm_args) \
821 __SURF_READ_V2(__ID("__isurf1Dread"), "1d", __asmtype, __type, \
822 __asm_op_args, __asm_args, (int x), ("r"(x)))
823#define __SURF_READ2D(__asmtype, __type, __asm_op_args, __asm_args) \
824 __SURF_READ_V2(__ID("__isurf2Dread"), "2d", __asmtype, __type, \
825 __asm_op_args, __asm_args, (int x, int y), ("r"(x), "r"(y)))
826#define __SURF_READ3D(__asmtype, __type, __asm_op_args, __asm_args) \
827 __SURF_READ_V2(__ID("__isurf3Dread"), "3d", __asmtype, __type, \
828 __asm_op_args, __asm_args, (int x, int y, int z), \
829 ("r"(x), "r"(y), "r"(z)))
830
831#define __SURF_READ1DLAYERED(__asmtype, __type, __asm_op_args, __asm_args) \
832 __SURF_READ_V2(__ID("__isurf1DLayeredread"), "a1d", __asmtype, __type, \
833 __asm_op_args, __asm_args, (int x, int layer), \
834 ("r"(x), "r"(layer)))
835#define __SURF_READ2DLAYERED(__asmtype, __type, __asm_op_args, __asm_args) \
836 __SURF_READ_V2(__ID("__isurf2DLayeredread"), "a2d", __asmtype, __type, \
837 __asm_op_args, __asm_args, (int x, int y, int layer), \
838 ("r"(x), "r"(y), "r"(layer)))
839#define __SURF_READCUBEMAP(__asmtype, __type, __asm_op_args, __asm_args) \
840 __SURF_READ_V2(__ID("__isurfCubemapread"), "a2d", __asmtype, __type, \
841 __asm_op_args, __asm_args, (int x, int y, int face), \
842 ("r"(x), "r"(y), "r"(face)))
843#define __SURF_READCUBEMAPLAYERED(__asmtype, __type, __asm_op_args, \
844 __asm_args) \
845 __SURF_READ_V2(__ID("__isurfCubemapLayeredread"), "a2d", __asmtype, __type, \
846 __asm_op_args, __asm_args, (int x, int y, int layerface), \
847 ("r"(x), "r"(y), "r"(layerface)))
848
849#define __1DV1 "{%0}, [%1, {%2}]"
850#define __1DV2 "{%0, %1}, [%2, {%3}]"
851#define __1DV4 "{%0, %1, %2, %3}, [%4, {%5}]"
852
853#define __2DV1 "{%0}, [%1, {%2, %3}]"
854#define __2DV2 "{%0, %1}, [%2, {%3, %4}]"
855#define __2DV4 "{%0, %1, %2, %3}, [%4, {%5, %6}]"
856
857#define __1DLAYERV1 "{%0}, [%1, {%3, %2}]"
858#define __1DLAYERV2 "{%0, %1}, [%2, {%4, %3}]"
859#define __1DLAYERV4 "{%0, %1, %2, %3}, [%4, {%6, %5}]"
860
861#define __3DV1 "{%0}, [%1, {%2, %3, %4, %4}]"
862#define __3DV2 "{%0, %1}, [%2, {%3, %4, %5, %5}]"
863#define __3DV4 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}]"
864
865#define __2DLAYERV1 "{%0}, [%1, {%4, %2, %3, %3}]"
866#define __2DLAYERV2 "{%0, %1}, [%2, {%5, %3, %4, %4}]"
867#define __2DLAYERV4 "{%0, %1, %2, %3}, [%4, {%7, %5, %6, %6}]"
868
869#define __CUBEMAPV1 "{%0}, [%1, {%4, %2, %3, %3}]"
870#define __CUBEMAPV2 "{%0, %1}, [%2, {%5, %3, %4, %4}]"
871#define __CUBEMAPV4 "{%0, %1, %2, %3}, [%4, {%7, %5, %6, %6}]"
872
873#define __CUBEMAPLAYERV1 "{%0}, [%1, {%4, %2, %3, %3}]"
874#define __CUBEMAPLAYERV2 "{%0, %1}, [%2, {%5, %3, %4, %4}]"
875#define __CUBEMAPLAYERV4 "{%0, %1, %2, %3}, [%4, {%7, %5, %6, %6}]"
876
877#define __SURF_READXD_ALL(__xdv1, __xdv2, __xdv4, __surf_readxd_v2) \
878 __surf_readxd_v2("b8", char, __xdv1, __SW_ASM_ARGS("=h")); \
879 __surf_readxd_v2("b8", signed char, __xdv1, __SW_ASM_ARGS("=h")); \
880 __surf_readxd_v2("b8", char1, __xdv1, __SW_ASM_ARGS1("=h")); \
881 __surf_readxd_v2("b8", unsigned char, __xdv1, __SW_ASM_ARGS("=h")); \
882 __surf_readxd_v2("b8", uchar1, __xdv1, __SW_ASM_ARGS1("=h")); \
883 __surf_readxd_v2("b16", short, __xdv1, __SW_ASM_ARGS("=h")); \
884 __surf_readxd_v2("b16", short1, __xdv1, __SW_ASM_ARGS1("=h")); \
885 __surf_readxd_v2("b16", unsigned short, __xdv1, __SW_ASM_ARGS("=h")); \
886 __surf_readxd_v2("b16", ushort1, __xdv1, __SW_ASM_ARGS1("=h")); \
887 __surf_readxd_v2("b32", int, __xdv1, __SW_ASM_ARGS("=r")); \
888 __surf_readxd_v2("b32", int1, __xdv1, __SW_ASM_ARGS1("=r")); \
889 __surf_readxd_v2("b32", unsigned int, __xdv1, __SW_ASM_ARGS("=r")); \
890 __surf_readxd_v2("b32", uint1, __xdv1, __SW_ASM_ARGS1("=r")); \
891 __surf_readxd_v2("b64", long long, __xdv1, __SW_ASM_ARGS("=l")); \
892 __surf_readxd_v2("b64", longlong1, __xdv1, __SW_ASM_ARGS1("=l")); \
893 __surf_readxd_v2("b64", unsigned long long, __xdv1, __SW_ASM_ARGS("=l")); \
894 __surf_readxd_v2("b64", ulonglong1, __xdv1, __SW_ASM_ARGS1("=l")); \
895 __surf_readxd_v2("b32", float, __xdv1, __SW_ASM_ARGS("=r")); \
896 __surf_readxd_v2("b32", float1, __xdv1, __SW_ASM_ARGS1("=r")); \
897 \
898 __surf_readxd_v2("v2.b8", char2, __xdv2, __SW_ASM_ARGS2("=h")); \
899 __surf_readxd_v2("v2.b8", uchar2, __xdv2, __SW_ASM_ARGS2("=h")); \
900 __surf_readxd_v2("v2.b16", short2, __xdv2, __SW_ASM_ARGS2("=h")); \
901 __surf_readxd_v2("v2.b16", ushort2, __xdv2, __SW_ASM_ARGS2("=h")); \
902 __surf_readxd_v2("v2.b32", int2, __xdv2, __SW_ASM_ARGS2("=r")); \
903 __surf_readxd_v2("v2.b32", uint2, __xdv2, __SW_ASM_ARGS2("=r")); \
904 __surf_readxd_v2("v2.b64", longlong2, __xdv2, __SW_ASM_ARGS2("=l")); \
905 __surf_readxd_v2("v2.b64", ulonglong2, __xdv2, __SW_ASM_ARGS2("=l")); \
906 __surf_readxd_v2("v2.b32", float2, __xdv2, __SW_ASM_ARGS2("=r")); \
907 \
908 __surf_readxd_v2("v4.b8", char4, __xdv4, __SW_ASM_ARGS4("=h")); \
909 __surf_readxd_v2("v4.b8", uchar4, __xdv4, __SW_ASM_ARGS4("=h")); \
910 __surf_readxd_v2("v4.b16", short4, __xdv4, __SW_ASM_ARGS4("=h")); \
911 __surf_readxd_v2("v4.b16", ushort4, __xdv4, __SW_ASM_ARGS4("=h")); \
912 __surf_readxd_v2("v4.b32", int4, __xdv4, __SW_ASM_ARGS4("=r")); \
913 __surf_readxd_v2("v4.b32", uint4, __xdv4, __SW_ASM_ARGS4("=r")); \
914 __surf_readxd_v2("v4.b32", float4, __xdv4, __SW_ASM_ARGS4("=r"))
915
916__SURF_READXD_ALL(__1DV1, __1DV2, __1DV4, __SURF_READ1D);
917__SURF_READXD_ALL(__2DV1, __2DV2, __2DV4, __SURF_READ2D);
918__SURF_READXD_ALL(__3DV1, __3DV2, __3DV4, __SURF_READ3D);
919__SURF_READXD_ALL(__1DLAYERV1, __1DLAYERV2, __1DLAYERV4, __SURF_READ1DLAYERED);
920__SURF_READXD_ALL(__2DLAYERV1, __2DLAYERV2, __2DLAYERV4, __SURF_READ2DLAYERED);
921__SURF_READXD_ALL(__CUBEMAPV1, __CUBEMAPV2, __CUBEMAPV4, __SURF_READCUBEMAP);
922__SURF_READXD_ALL(__CUBEMAPLAYERV1, __CUBEMAPLAYERV2, __CUBEMAPLAYERV4,
923                  __SURF_READCUBEMAPLAYERED);
924
925#define __SURF_WRITE1D_V2(__asmtype, __type, __asm_op_args, __asm_args) \
926 __SURF_WRITE_V2(__ID("__isurf1Dwrite_v2"), "1d", __asmtype, __type, "{%1}", \
927 (int x), ("r"(x)), __asm_op_args, __asm_args)
928#define __SURF_WRITE1DLAYERED_V2(__asmtype, __type, __asm_op_args, __asm_args) \
929 __SURF_WRITE_V2(__ID("__isurf1DLayeredwrite_v2"), "a1d", __asmtype, __type, \
930 "{%2, %1}", (int x, int layer), ("r"(x), "r"(layer)), \
931 __asm_op_args, __asm_args)
932#define __SURF_WRITE2D_V2(__asmtype, __type, __asm_op_args, __asm_args) \
933 __SURF_WRITE_V2(__ID("__isurf2Dwrite_v2"), "2d", __asmtype, __type, \
934 "{%1, %2}", (int x, int y), ("r"(x), "r"(y)), __asm_op_args, \
935 __asm_args)
936#define __SURF_WRITE2DLAYERED_V2(__asmtype, __type, __asm_op_args, __asm_args) \
937 __SURF_WRITE_V2(__ID("__isurf2DLayeredwrite_v2"), "a2d", __asmtype, __type, \
938 "{%3, %1, %2, %2}", (int x, int y, int layer), \
939 ("r"(x), "r"(y), "r"(layer)), __asm_op_args, __asm_args)
940#define __SURF_WRITE3D_V2(__asmtype, __type, __asm_op_args, __asm_args) \
941 __SURF_WRITE_V2(__ID("__isurf3Dwrite_v2"), "3d", __asmtype, __type, \
942 "{%1, %2, %3, %3}", (int x, int y, int z), \
943 ("r"(x), "r"(y), "r"(z)), __asm_op_args, __asm_args)
944
945#define __SURF_CUBEMAPWRITE_V2(__asmtype, __type, __asm_op_args, __asm_args) \
946 __SURF_WRITE_V2(__ID("__isurfCubemapwrite_v2"), "a2d", __asmtype, __type, \
947 "{%3, %1, %2, %2}", (int x, int y, int face), \
948 ("r"(x), "r"(y), "r"(face)), __asm_op_args, __asm_args)
949#define __SURF_CUBEMAPLAYEREDWRITE_V2(__asmtype, __type, __asm_op_args, \
950 __asm_args) \
951 __SURF_WRITE_V2(__ID("__isurfCubemapLayeredwrite_v2"), "a2d", __asmtype, \
952 __type, "{%3, %1, %2, %2}", (int x, int y, int layerface), \
953 ("r"(x), "r"(y), "r"(layerface)), __asm_op_args, __asm_args)
954
955#define __SURF_WRITEXD_V2_ALL(__xdv1, __xdv2, __xdv4, __surf_writexd_v2) \
956 __surf_writexd_v2("b8", char, __xdv1, __SW_ASM_ARGS("h")); \
957 __surf_writexd_v2("b8", signed char, __xdv1, __SW_ASM_ARGS("h")); \
958 __surf_writexd_v2("b8", char1, __xdv1, __SW_ASM_ARGS1("h")); \
959 __surf_writexd_v2("b8", unsigned char, __xdv1, __SW_ASM_ARGS("h")); \
960 __surf_writexd_v2("b8", uchar1, __xdv1, __SW_ASM_ARGS1("h")); \
961 __surf_writexd_v2("b16", short, __xdv1, __SW_ASM_ARGS("h")); \
962 __surf_writexd_v2("b16", short1, __xdv1, __SW_ASM_ARGS1("h")); \
963 __surf_writexd_v2("b16", unsigned short, __xdv1, __SW_ASM_ARGS("h")); \
964 __surf_writexd_v2("b16", ushort1, __xdv1, __SW_ASM_ARGS1("h")); \
965 __surf_writexd_v2("b32", int, __xdv1, __SW_ASM_ARGS("r")); \
966 __surf_writexd_v2("b32", int1, __xdv1, __SW_ASM_ARGS1("r")); \
967 __surf_writexd_v2("b32", unsigned int, __xdv1, __SW_ASM_ARGS("r")); \
968 __surf_writexd_v2("b32", uint1, __xdv1, __SW_ASM_ARGS1("r")); \
969 __surf_writexd_v2("b64", long long, __xdv1, __SW_ASM_ARGS("l")); \
970 __surf_writexd_v2("b64", longlong1, __xdv1, __SW_ASM_ARGS1("l")); \
971 __surf_writexd_v2("b64", unsigned long long, __xdv1, __SW_ASM_ARGS("l")); \
972 __surf_writexd_v2("b64", ulonglong1, __xdv1, __SW_ASM_ARGS1("l")); \
973 __surf_writexd_v2("b32", float, __xdv1, __SW_ASM_ARGS("r")); \
974 __surf_writexd_v2("b32", float1, __xdv1, __SW_ASM_ARGS1("r")); \
975 \
976 __surf_writexd_v2("v2.b8", char2, __xdv2, __SW_ASM_ARGS2("h")); \
977 __surf_writexd_v2("v2.b8", uchar2, __xdv2, __SW_ASM_ARGS2("h")); \
978 __surf_writexd_v2("v2.b16", short2, __xdv2, __SW_ASM_ARGS2("h")); \
979 __surf_writexd_v2("v2.b16", ushort2, __xdv2, __SW_ASM_ARGS2("h")); \
980 __surf_writexd_v2("v2.b32", int2, __xdv2, __SW_ASM_ARGS2("r")); \
981 __surf_writexd_v2("v2.b32", uint2, __xdv2, __SW_ASM_ARGS2("r")); \
982 __surf_writexd_v2("v2.b64", longlong2, __xdv2, __SW_ASM_ARGS2("l")); \
983 __surf_writexd_v2("v2.b64", ulonglong2, __xdv2, __SW_ASM_ARGS2("l")); \
984 __surf_writexd_v2("v2.b32", float2, __xdv2, __SW_ASM_ARGS2("r")); \
985 \
986 __surf_writexd_v2("v4.b8", char4, __xdv4, __SW_ASM_ARGS4("h")); \
987 __surf_writexd_v2("v4.b8", uchar4, __xdv4, __SW_ASM_ARGS4("h")); \
988 __surf_writexd_v2("v4.b16", short4, __xdv4, __SW_ASM_ARGS4("h")); \
989 __surf_writexd_v2("v4.b16", ushort4, __xdv4, __SW_ASM_ARGS4("h")); \
990 __surf_writexd_v2("v4.b32", int4, __xdv4, __SW_ASM_ARGS4("r")); \
991 __surf_writexd_v2("v4.b32", uint4, __xdv4, __SW_ASM_ARGS4("r")); \
992 __surf_writexd_v2("v4.b32", float4, __xdv4, __SW_ASM_ARGS4("r"))
993
994#define __1DV1 "{%2}"
995#define __1DV2 "{%2, %3}"
996#define __1DV4 "{%2, %3, %4, %5}"
997
998#define __2DV1 "{%3}"
999#define __2DV2 "{%3, %4}"
1000#define __2DV4 "{%3, %4, %5, %6}"
1001
1002#define __3DV1 "{%4}"
1003#define __3DV2 "{%4, %5}"
1004#define __3DV4 "{%4, %5, %6, %7}"
1005
1006__SURF_WRITEXD_V2_ALL(__1DV1, __1DV2, __1DV4, __SURF_WRITE1D_V2);
1007__SURF_WRITEXD_V2_ALL(__2DV1, __2DV2, __2DV4, __SURF_WRITE1DLAYERED_V2);
1008__SURF_WRITEXD_V2_ALL(__2DV1, __2DV2, __2DV4, __SURF_WRITE2D_V2);
1009__SURF_WRITEXD_V2_ALL(__3DV1, __3DV2, __3DV4, __SURF_WRITE2DLAYERED_V2);
1010__SURF_WRITEXD_V2_ALL(__3DV1, __3DV2, __3DV4, __SURF_WRITE3D_V2);
1011__SURF_WRITEXD_V2_ALL(__3DV1, __3DV2, __3DV4, __SURF_CUBEMAPWRITE_V2);
1012__SURF_WRITEXD_V2_ALL(__3DV1, __3DV2, __3DV4, __SURF_CUBEMAPLAYEREDWRITE_V2);
1013
1014template <class __op, class __DataT, class... __Args>
1015__device__ static void __tex_fetch_impl(__surface_op_tag, __DataT *__ptr,
1016 cudaSurfaceObject_t __handle,
1017 __Args... __args) {
1018 __surf_read_write_v2<__op, __DataT>::__run(__ptr, __handle, __args...);
1019}
1020
1021// These are the top-level function overloads the __nv_tex_surf_handler expands
1022// to. Each overload deals with one of the several ways __nv_tex_surf_handler
1023// is called by CUDA headers. In the end, each of the overloads does the same
1024// job -- it figures out which `__tex_fetch_v4::__run` variant should be used
1025// to fetch texture data and which `__convert::__run` is needed to convert it
1026// into the appropriate return type.
1027
1028// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
1029// Data type and return type are based on ret.
1030template <class __op, class __T, class... __Args>
1031__device__ static void __tex_fetch_impl(__texture_op_tag, __T *__ptr,
1032 cudaTextureObject_t __handle,
1033 __Args... __args) {
1034 using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
1035 *__ptr = __convert<__T, __FetchT>::__run(
1036 __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
1037}
1038
1039template <class __op, class __T, class... __Args>
1040__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
1041 __Args... __args) {
1042 using op_type = typename __op_type_traits<__op>::type;
1043 __tex_fetch_impl<__op>(op_type{}, __ptr, __handle, __args...);
1044}
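// For illustration: a call like
//   __nv_tex_surf_handler("__isurf2Dwrite_v2", &__val, __obj, __x, __y, __mode);
// resolves here with __op = __ID("__isurf2Dwrite_v2"); __op_type_traits picks
// __surface_op_tag (because of the __OP_TYPE_SURFACE list above), so the call
// lands in the surface __tex_fetch_impl and, from there, in
// __surf_read_write_v2<__op, __DataT>::__run(). Texture ops such as
// "__tex2D_v2" keep the default __texture_op_tag and go through
// __tex_fetch_v4/__convert instead.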
1045
1046#if CUDA_VERSION < 12000
1047// texture<> objects get magically converted into a texture reference. However,
1048// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
1049// cheat a bit and use inline assembly to do it. It costs us an extra register
1050// and a move, but that is easy for ptxas to optimize away.
1051template <class __T>
1052__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
1053 cudaTextureObject_t __obj;
1054 asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
1055 return __obj;
1056}
1057
1058// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
1059// Data type and return type are based on ret.
1060template <class __op, class __T, class __HandleT, class... __Args>
1061__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
1062 __Args... __args) {
1063 using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
1064 *__ptr = __convert<__T, __FetchT>::__run(
1065 __tex_fetch_v4<__op>::template __run<__FetchT>(
1066 __tex_handle_to_obj(__handle), __args...));
1067}
1068
1069// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
1070// cudaReadModeNormalizedFloat fetches always return float4.
1071template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
1072__device__ static void
1073__tex_fetch(__DataT *, __RetT *__ptr,
1074 texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
1075 __Args... __args) {
1076 using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
1077 *__ptr = __convert<__RetT, float4>::__run(
1078 __tex_fetch_v4<__op>::template __run<__FetchT>(
1079 __tex_handle_to_obj(__handle), __args...));
1080}
1081
1082// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
1083// For cudaReadModeElementType fetch return type is based on type_dummy.
1084template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
1085__device__ static void
1086__tex_fetch(__DataT *, __RetT *__ptr,
1087 texture<__DataT, __TexT, cudaReadModeElementType> __handle,
1088 __Args... __args) {
1089 using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
1090 *__ptr = __convert<__RetT, __FetchT>::__run(
1091 __tex_fetch_v4<__op>::template __run<__FetchT>(
1092 __tex_handle_to_obj(__handle), __args...));
1093}
1094#endif // CUDA_VERSION
1095} // namespace __cuda_tex
1096} // namespace
1097#pragma pop_macro("__ASM_OUT")
1098#pragma pop_macro("__ASM_OUTP")
1099#pragma pop_macro("__Args")
1100#pragma pop_macro("__ID")
1101#pragma pop_macro("__IDV")
1102#pragma pop_macro("__OP_TYPE_SURFACE")
1103#pragma pop_macro("__IMPL_2DGATHER")
1104#pragma pop_macro("__IMPL_ALIAS")
1105#pragma pop_macro("__IMPL_ALIASI")
1106#pragma pop_macro("__IMPL_F1")
1107#pragma pop_macro("__IMPL_F3")
1108#pragma pop_macro("__IMPL_F3N")
1109#pragma pop_macro("__IMPL_F3S")
1110#pragma pop_macro("__IMPL_S")
1111#pragma pop_macro("__IMPL_S3")
1112#pragma pop_macro("__IMPL_S3I")
1113#pragma pop_macro("__IMPL_S3N")
1114#pragma pop_macro("__IMPL_S3NI")
1115#pragma pop_macro("__IMPL_S3S")
1116#pragma pop_macro("__IMPL_S3SI")
1117#pragma pop_macro("__IMPL_SI")
1118#pragma pop_macro("__L")
1119#pragma pop_macro("__STRIP_PARENS")
1120#pragma pop_macro("__SURF_WRITE_V2")
1121#pragma pop_macro("__SW_ASM_ARGS")
1122#pragma pop_macro("__SW_ASM_ARGS1")
1123#pragma pop_macro("__SW_ASM_ARGS2")
1124#pragma pop_macro("__SW_ASM_ARGS4")
1125#pragma pop_macro("__SURF_WRITE_V2")
1126#pragma pop_macro("__SURF_READ_V2")
1127#pragma pop_macro("__SW_ASM_ARGS")
1128#pragma pop_macro("__SW_ASM_ARGS1")
1129#pragma pop_macro("__SW_ASM_ARGS2")
1130#pragma pop_macro("__SW_ASM_ARGS4")
1131#pragma pop_macro("__SURF_READ1D");
1132#pragma pop_macro("__SURF_READ2D");
1133#pragma pop_macro("__SURF_READ3D");
1134#pragma pop_macro("__SURF_READ1DLAYERED");
1135#pragma pop_macro("__SURF_READ2DLAYERED");
1136#pragma pop_macro("__SURF_READCUBEMAP");
1137#pragma pop_macro("__SURF_READCUBEMAPLAYERED");
1138#pragma pop_macro("__1DV1");
1139#pragma pop_macro("__1DV2");
1140#pragma pop_macro("__1DV4");
1141#pragma pop_macro("__2DV1");
1142#pragma pop_macro("__2DV2");
1143#pragma pop_macro("__2DV4");
1144#pragma pop_macro("__1DLAYERV1");
1145#pragma pop_macro("__1DLAYERV2");
1146#pragma pop_macro("__1DLAYERV4");
1147#pragma pop_macro("__3DV1");
1148#pragma pop_macro("__3DV2");
1149#pragma pop_macro("__3DV4");
1150#pragma pop_macro("__2DLAYERV1");
1151#pragma pop_macro("__2DLAYERV2");
1152#pragma pop_macro("__2DLAYERV4");
1153#pragma pop_macro("__CUBEMAPV1");
1154#pragma pop_macro("__CUBEMAPV2");
1155#pragma pop_macro("__CUBEMAPV4");
1156#pragma pop_macro("__CUBEMAPLAYERV1");
1157#pragma pop_macro("__CUBEMAPLAYERV2");
1158#pragma pop_macro("__CUBEMAPLAYERV4");
1159#pragma pop_macro("__SURF_READXD_ALL");
1160#pragma pop_macro("__SURF_WRITE1D_V2");
1161#pragma pop_macro("__SURF_WRITE1DLAYERED_V2");
1162#pragma pop_macro("__SURF_WRITE2D_V2");
1163#pragma pop_macro("__SURF_WRITE2DLAYERED_V2");
1164#pragma pop_macro("__SURF_WRITE3D_V2");
1165#pragma pop_macro("__SURF_CUBEMAPWRITE_V2");
1166#pragma pop_macro("__SURF_CUBEMAPLAYEREDWRITE_V2");
1167#pragma pop_macro("__SURF_WRITEXD_V2_ALL");
1168#pragma pop_macro("__1DV1");
1169#pragma pop_macro("__1DV2");
1170#pragma pop_macro("__1DV4");
1171#pragma pop_macro("__2DV1");
1172#pragma pop_macro("__2DV2");
1173#pragma pop_macro("__2DV4");
1174#pragma pop_macro("__3DV1");
1175#pragma pop_macro("__3DV2");
1176#pragma pop_macro("__3DV4");
1177#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__