__clang_cuda_texture_intrinsics.h
1/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 *
9 * This header provides in-header implementations for NVCC's built-in
10 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
11 * built-in is unusual as it's actually a set of function overloads that use the
12 * first string literal argument as one of the overload parameters.
13 */
14#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
15#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
16#ifndef __CUDA__
17#error "This file is for CUDA compilation only."
18#endif
19
20// __nv_tex_surf_handler() is provided by this header as a macro.
21#define __nv_tex_surf_handler(__op, __ptr, ...) \
22 ::__cuda_tex::__tex_fetch< \
23 ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
24 __VA_ARGS__)
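// Editorial example (not part of the upstream header): a minimal sketch of how
// the macro expands. Assuming '__ret' is a float4, '__obj' a
// cudaTextureObject_t and '__x'/'__y' floats, a call such as
//
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
//
// expands to roughly
//
//   ::__cuda_tex::__tex_fetch<
//       ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex2D_v2")>>(
//       &__ret, __obj, __x, __y);
//
// i.e. the string literal survives only as a compile-time hash that selects a
// __tex_fetch_v4 specialization defined further down in this file.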
25
26#pragma push_macro("__ASM_OUT")
27#pragma push_macro("__ASM_OUTP")
28#pragma push_macro("__Args")
29#pragma push_macro("__ID")
30#pragma push_macro("__IDV")
31#pragma push_macro("__IMPL_2DGATHER")
32#pragma push_macro("__IMPL_ALIAS")
33#pragma push_macro("__IMPL_ALIASI")
34#pragma push_macro("__IMPL_F1")
35#pragma push_macro("__IMPL_F3")
36#pragma push_macro("__IMPL_F3N")
37#pragma push_macro("__IMPL_F3S")
38#pragma push_macro("__IMPL_S")
39#pragma push_macro("__IMPL_S3")
40#pragma push_macro("__IMPL_S3I")
41#pragma push_macro("__IMPL_S3N")
42#pragma push_macro("__IMPL_S3NI")
43#pragma push_macro("__IMPL_S3S")
44#pragma push_macro("__IMPL_S3SI")
45#pragma push_macro("__IMPL_SI")
46#pragma push_macro("__L")
47#pragma push_macro("__STRIP_PARENS")
48
49// Put all functions into an anonymous namespace so they have internal linkage.
50// The device-only functions here must be internal in order to avoid ODR
51// violations in case they are used from files compiled with
52// -fgpu-rdc. E.g. a library and an app using it may be built with different
53// versions of this header file.
54namespace {
55
56// Put the implementation into its own namespace so we don't pollute the TU.
57namespace __cuda_tex {
58
59// First, we need a perfect hash function and a few constexpr helper functions
60// for converting a string literal into a numeric value which can be used to
61// parametrize a template. We cannot use string literals for that, as that
62// would require C++20.
63//
64// The hash function was generated with 'gperf' and then manually converted into
65// its constexpr equivalent.
66//
67// NOTE: the perfect hashing scheme comes with an inherent self-test. If the hash
68// function has a collision for any of the texture operations, the compilation
69// will fail due to an attempt to redefine a tag with the same value. If the
70// header compiles, then the hash function is good enough for the job.
71
72constexpr int __tex_len(const char *s) {
73 return (s[0] == 0) ? 0
74 : (s[1] == 0) ? 1
75 : (s[2] == 0) ? 2
76 : (s[3] == 0) ? 3
77 : (s[4] == 0) ? 4
78 : (s[5] == 0) ? 5
79 : (s[6] == 0) ? 6
80 : (s[7] == 0) ? 7
81 : (s[8] == 0) ? 8
82 : (s[9] == 0) ? 9
83 : (s[10] == 0) ? 10
84 : (s[11] == 0) ? 11
85 : (s[12] == 0) ? 12
86 : (s[13] == 0) ? 13
87 : (s[14] == 0) ? 14
88 : (s[15] == 0) ? 15
89 : (s[16] == 0) ? 16
90 : (s[17] == 0) ? 17
91 : (s[18] == 0) ? 18
92 : (s[19] == 0) ? 19
93 : (s[20] == 0) ? 20
94 : (s[21] == 0) ? 21
95 : (s[22] == 0) ? 22
96 : (s[23] == 0) ? 23
97 : (s[24] == 0) ? 24
98 : (s[25] == 0) ? 25
99 : (s[26] == 0) ? 26
100 : (s[27] == 0) ? 27
101 : (s[28] == 0) ? 28
102 : (s[29] == 0) ? 29
103 : (s[30] == 0) ? 30
104 : (s[31] == 0) ? 31
105 : 32;
106}
107
108constexpr int __tex_hash_map(int c) {
109 return (c == 49) ? 10
110 : (c == 50) ? 0
111 : (c == 51) ? 100
112 : (c == 52) ? 30
113 : (c == 67) ? 10
114 : (c == 68) ? 0
115 : (c == 69) ? 25
116 : (c == 72) ? 70
117 : (c == 77) ? 0
118 : (c == 96) ? 44
119 : (c == 99) ? 10
120 : (c == 100) ? 5
121 : (c == 101) ? 60
122 : (c == 102) ? 40
123 : (c == 103) ? 70
124 : (c == 104) ? 25
125 : (c == 112) ? 0
126 : (c == 114) ? 45
127 : (c == 117) ? 5
128 : (c == 118) ? 85
129 : (c == 120) ? 20
130 : 225;
131}
132
133constexpr int __tex_op_hash(const char *str) {
134 return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
135 __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
136}
137
138// Tag type to identify particular texture operation.
139template <int N> struct __Tag;
140#define __ID(__op) __Tag<__tex_op_hash(__op)>
141// Tags for variants of a particular operation. E.g. tex2Dgather can translate
142// into 4 different instructions.
143#define __IDV(__op, __variant) \
144 __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
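// Editorial example (not part of the upstream header): a sketch of how an op
// string becomes a type. Because __tex_op_hash() is constexpr, it can appear
// in a template argument:
//
//   using __Tex2DTag = __Tag<__tex_op_hash("__tex2D_v2")>; // == __ID("__tex2D_v2")
//   using __GatherGreenTag = __IDV("__tex2Dgather_v2", 1); // variant #1 of the op
//
// Each texture operation below specializes __tex_fetch_v4 on one of these tag
// types. If two op strings ever hashed to the same value, the duplicated
// specialization would fail to compile -- that is the self-test mentioned in
// the comment above the hash function.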
145
146// Helper classes for figuring out key data types for derived types.
147// E.g. char2 has __base_t = char, __fetch_t = int4.
148template <class> struct __TypeInfoT;
149// Type info for the fundamental types.
150template <> struct __TypeInfoT<float> {
151 using __base_t = float;
152 using __fetch_t = float4;
153};
154template <> struct __TypeInfoT<char> {
155 using __base_t = char;
156 using __fetch_t = int4;
157};
158template <> struct __TypeInfoT<signed char> {
159 using __base_t = signed char;
160 using __fetch_t = int4;
161};
162template <> struct __TypeInfoT<unsigned char> {
163 using __base_t = unsigned char;
164 using __fetch_t = uint4;
165};
166template <> struct __TypeInfoT<short> {
167 using __base_t = short;
168 using __fetch_t = int4;
169};
170template <> struct __TypeInfoT<unsigned short> {
171 using __base_t = unsigned short;
172 using __fetch_t = uint4;
173};
174template <> struct __TypeInfoT<int> {
175 using __base_t = int;
176 using __fetch_t = int4;
177};
178template <> struct __TypeInfoT<unsigned int> {
179 using __base_t = unsigned int;
180 using __fetch_t = uint4;
181};
182
183// Derived base/fetch types for N-element vectors.
184template <class __T> struct __TypeInfoT {
185 using __base_t = decltype(__T::x);
186 using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
187};
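// Editorial example (not part of the upstream header): a sketch of how the
// type info resolves for a vector type such as char2. decltype(char2::x) is
// char, so:
//
//   using __TI = __TypeInfoT<char2>;
//   // __TI::__base_t  is char
//   // __TI::__fetch_t is __TypeInfoT<char>::__fetch_t, i.e. int4
//
// In other words, a texture of char2 elements is fetched as an int4 and later
// narrowed to two lanes by the __convert helper defined further below.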
188
189// Classes that implement specific texture ops.
190template <class __op> struct __tex_fetch_v4;
191
192// Helper macros to strip parens from a macro argument.
193#define __Args(...) __VA_ARGS__
194#define __STRIP_PARENS(__X) __X
195#define __L(__X) __STRIP_PARENS(__Args __X)
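// Editorial example (not part of the upstream header): these helpers let a
// parenthesized argument list travel as one macro argument and be re-expanded
// later. E.g.
//
//   __L((float __x, float __y))
//
// becomes __STRIP_PARENS(__Args (float __x, float __y)); __Args then consumes
// the parentheses, leaving the bare token sequence
//
//   float __x, float __y
//
// which the __IMPL_* macros below splice into function signatures and asm()
// operand lists.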
196
197// Construct inline assembly output args.
198// Results are stored in a temp var __r.
199// isResident bool is pointed to by __ir
200// Asm args for return values. It's a 4-element vector
201#define __ASM_OUT(__t) \
202 ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
203// .. possibly combined with a predicate.
204#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
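// Editorial example (not part of the upstream header): with __t == "r",
// __ASM_OUT("r") yields the parenthesized constraint list
//
//   ("=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w))
//
// and __ASM_OUTP("r") appends one more output, "=h"(*__ir), which receives the
// 16-bit residency flag computed by the sparse variants.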
205
206// Implements a single variant of a texture fetch instruction.
207#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
208 template <> \
209 __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
210 __rt __r; \
211 asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
212 return __r; \
213 }
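// Editorial example (not part of the upstream header): a sketch of the code a
// single __IMPL_F1 use generates. For the float4 variant of the plain 1D fetch
// ("__tex1D_v2"), the member specialization produced inside the corresponding
// __tex_fetch_v4 struct looks roughly like:
//
//   template <>
//   __device__ float4 __run<float4>(cudaTextureObject_t __obj, float __x) {
//     float4 __r;
//     asm("tex.1d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
//         : "=f"(__r.x), "=f"(__r.y), "=f"(__r.z), "=f"(__r.w)
//         : "l"(__obj), "f"(__x));
//     return __r;
//   }
//
// i.e. one PTX 'tex' instruction whose four outputs land directly in the
// returned vector.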
214
215// Implements texture fetch instructions for int4/uint4/float4 data types.
216#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
217 __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
218 __ASM_OUT("r"), __asm_args) \
219 __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
220 __ASM_OUT("r"), __asm_args) \
221 __IMPL_F1(float4, float4, __args, \
222 __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
223 __asm_args)
224// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
225// types. Similar to above, but returns a boolean 'isPresent' value in addition
226// to the texture data.
227#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
228 __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
229 __ASM_OUTP("r"), __asm_args) \
230 __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
231 __ASM_OUTP("r"), __asm_args) \
232 __IMPL_F1(float4, float4, __args, \
233 __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
234 __asm_args)
235
236// Similar to F3, but for integer data which is returned as normalized floats.
237// Only instantiates fetch functions for int4/uint4.
238#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
239 __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
240 __ASM_OUT("r"), __asm_args) \
241 __IMPL_F1(float4, uint4, __args, \
242 __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
243 __asm_args)
244
245// Instantiates __tex_fetch_v4 with regular fetch functions.
246#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
247 template <> struct __tex_fetch_v4<__op> { \
248 template <class T> \
249 __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
250 __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
251 }
252
253// Same, but for sparse ops. Only available on sm_60+.
254#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
255#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
256 __asm_args) \
257 template <> struct __tex_fetch_v4<__op> { \
258 template <class T> \
259 __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
260 __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
261 }
262#else
263#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
264#endif
265
266// Same, but for normalized float ops.
267#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
268 __asm_args) \
269 template <> struct __tex_fetch_v4<__op> { \
270 template <class T> \
271 __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
272 __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
273 }
274
275// Regular and normalized float ops share a lot of similarities. This macro
276// instantiates both variants -- normal for __op and normalized for __opn.
277#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
278 __asm_args) \
279 __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
280 __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
281
282// Convenience macros which convert the string literal __op into a __Tag.
283#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
284 __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
285#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
286 __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
287#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
288 __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
289#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
290 __asm_args) \
291 __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
292 __asm_args)
293
294// CUDA headers have some 'legacy' texture operations that duplicate
295// functionality. So, we just inherit the implementation instead of duplicating it.
296#define __IMPL_ALIASI(__op, __opn) \
297 template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
298#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
299
300// Now we can instantiate everything we need for each specific texture fetch
301// variant.
302__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
303 "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
304__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
305 "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
306__IMPL_ALIAS("__itex1D", "__tex1D_v2");
307__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
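// Editorial example (not part of the upstream header): a minimal sketch of how
// CUDA code reaches the instantiations above. For a 1D texture object holding
// int data, a hypothetical device function
//
//   __device__ int4 __fetch_texel(cudaTextureObject_t __tex, int __i) {
//     int4 __val;
//     __nv_tex_surf_handler("__tex1Dfetch_v2", &__val, __tex, __i);
//     return __val;
//   }
//
// resolves to __tex_fetch_v4<__ID("__tex1Dfetch_v2")>::__run<int4> and emits
// the tex.1d.v4.s32.s32 instruction. In practice these __nv_tex_surf_handler()
// calls come from CUDA's texture wrapper headers (tex1Dfetch() and friends)
// rather than from user code.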
308
309__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
310 (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
311 "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
312 ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
313__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
314
315__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
316 (float __x, int __layer), "tex.a1d.v4", "f32",
317 "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
318__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
319
320__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
321 (float __x, int __layer, float __dPdx, float __dPdy),
322 "tex.grad.a1d.v4", "f32",
323 "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
324 ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
325__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
326
327__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
328 (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
329 "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
330 ("r"(__layer), "f"(__x), "f"(__level)));
331__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
332
333__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
334 "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
335 ("f"(__x), "f"(__level)));
336__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
337
338// 2D
339__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
340 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
341__IMPL_ALIAS("__itex2D", "__tex2D_v2");
342
343__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
344 "{.reg .pred %%p0;\n\t"
345 "tex.2d.v4",
346 "f32",
347 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
348 " selp.u16 %4, 1, 0, %%p0; }",
349 ("f"(__x), "f"(__y)));
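// Editorial example (not part of the upstream header): the sparse variants take
// one extra unsigned char* out-parameter for the residency flag. A minimal
// sketch with hypothetical names:
//
//   unsigned char __resident;
//   float4 __val;
//   __nv_tex_surf_handler("__itex2D_sparse", &__val, __obj, __x, __y,
//                         &__resident);
//   // __resident is 1 if the accessed page was resident, 0 otherwise --
//   // the value produced by the selp.u16 on predicate %p0 in the asm above.
//
// These variants are only instantiated for sm_60 and newer (see the guard
// around __IMPL_S3SI).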
350
351__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
352 (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
353 "tex.grad.2d.v4", "f32",
354 "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
355 ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
356 "f"(__dPdy->y)));
357__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
358
359__IMPL_S3S("__itex2DGrad_sparse",
360 (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
361 unsigned char *__ir),
362 "{.reg .pred %%p0;\n\t"
363 "tex.grad.2d.v4",
364 "f32",
365 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
366 "selp.u16 %4, 1, 0, %%p0; }",
367 ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
368 "f"(__dPdy->y)));
369
370__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
371 (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
372 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
373 ("r"(__layer), "f"(__x), "f"(__y)));
374__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
375
376__IMPL_S3S("__itex2DLayered_sparse",
377 (float __x, float __y, int __layer, unsigned char *__ir),
378 "{.reg .pred %%p0;\n\t"
379 "tex.a2d.v4",
380 "f32",
381 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
382 "selp.u16 %4, 1, 0, %%p0; }",
383 ("r"(__layer), "f"(__x), "f"(__y)));
384
385__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
386 (float __x, float __y, int __layer, const float2 *__dPdx,
387 const float2 *__dPdy),
388 "tex.grad.a2d.v4", "f32",
389 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
390 ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
391 "f"(__dPdy->x), "f"(__dPdy->y)));
392__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
393
394__IMPL_S3S(
 395 "__itex2DLayeredGrad_sparse",
396 (float __x, float __y, int __layer, const float2 *__dPdx,
397 const float2 *__dPdy, unsigned char *__ir),
398 "{.reg .pred %%p0;\n\t"
399 "tex.grad.a2d.v4",
400 "f32",
401 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
402 "selp.u16 %4, 1, 0, %%p0; }",
403 ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
404 "f"(__dPdy->x), "f"(__dPdy->y)));
405
406__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
407 (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
408 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
409 ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
410__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
411
412__IMPL_S3S("__itex2DLayeredLod_sparse",
413 (float __x, float __y, int __layer, float __level,
414 unsigned char *__ir),
415 "{.reg .pred %%p0;\n\t"
416 "tex.level.a2d.v4",
417 "f32",
418 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
419 "selp.u16 %4, 1, 0, %%p0; }",
420 ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
421
422__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
423 (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
424 "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
425 ("f"(__x), "f"(__y), "f"(__level)));
426__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
427
428__IMPL_S3S("__itex2DLod_sparse",
429 (float __x, float __y, float __level, unsigned char *__ir),
430 "{.reg .pred %%p0;\n\t"
431 "tex.level.2d.v4",
432 "f32",
433 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
434 "selp.u16 %4, 1, 0, %%p0; }",
435 ("f"(__x), "f"(__y), "f"(__level)));
436
437// 2D gather is special. Unlike other variants that translate into exactly one
438// asm instruction, it uses one of the four different instructions selected by
439// __comp. We implement each instruction variant separately, and dispatch the
440// right one from the manually implemented 'umbrella' fetch.
441#define __IMPL_2DGATHER(variant, instr) \
442 __IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
443 __IDV("__tex2Dgather_rmnf_v2", variant), \
444 (float __x, float __y, int __comp), instr, "f32", \
445 "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
446 __IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
447 __IDV("__tex2Dgather_v2", variant)); \
448 __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
449 (float __x, float __y, unsigned char *__ir, int __comp), \
450 "{.reg .pred %%p0;\n\t" instr, "f32", \
451 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \
452 "selp.u16 %4, 1, 0, %%p0; }", \
453 ("f"(__x), "f"(__y)));
454__IMPL_2DGATHER(0, "tld4.r.2d.v4");
455__IMPL_2DGATHER(1, "tld4.g.2d.v4");
456__IMPL_2DGATHER(2, "tld4.b.2d.v4");
457__IMPL_2DGATHER(3, "tld4.a.2d.v4");
458
459// Umbrella dispatcher -- calls into specific 2Dgather variant.
460template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
461 template <class __T>
462 __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
463 int __comp) {
464 switch (__comp) {
465 case 0:
466 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
467 __obj, __x, __y, __comp);
468 case 1:
469 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
470 __obj, __x, __y, __comp);
471 case 2:
472 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
473 __obj, __x, __y, __comp);
474 case 3:
475 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
476 __obj, __x, __y, __comp);
477 }
478 }
479};
480__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
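// Editorial example (not part of the upstream header): a sketch of the dispatch
// above. A hypothetical call
//
//   float4 __g;
//   __nv_tex_surf_handler("__tex2Dgather_v2", &__g, __obj, __x, __y, /*comp*/1);
//
// lands in the umbrella __run, whose switch on __comp forwards to the
// __IDV("__tex2Dgather_v2", 1) specialization, i.e. the tld4.g.2d.v4
// instruction that gathers the green component. Components 0/2/3 select
// tld4.r/tld4.b/tld4.a in the same way.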
481
482template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
483 template <class __T>
484 __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
485 float __y, int __comp) {
486 switch (__comp) {
487 case 0:
488 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
489 __obj, __x, __y, __comp);
490 case 1:
491 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
492 __obj, __x, __y, __comp);
493 case 2:
494 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
495 __obj, __x, __y, __comp);
496 case 3:
497 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
498 __obj, __x, __y, __comp);
499 }
500 }
501};
502
503#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
504template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
505 template <class __T>
506 __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
507 unsigned char *__ir, int __comp) {
508 switch (__comp) {
509 case 0:
510 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
511 __obj, __x, __y, __ir, __comp);
512 case 1:
513 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
514 __obj, __x, __y, __ir, __comp);
515 case 2:
516 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
517 __obj, __x, __y, __ir, __comp);
518 case 3:
519 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
520 __obj, __x, __y, __ir, __comp);
521 }
522 }
523};
524#endif
525
526// 3D
527__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
528 "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
529 ("f"(__x), "f"(__y), "f"(__z)));
530__IMPL_ALIAS("__itex3D", "__tex3D_v2");
531
532__IMPL_S3S("__itex3D_sparse",
533 (float __x, float __y, float __z, unsigned char *__ir),
534 "{.reg .pred %%p0;\n\t"
535 "tex.3d.v4",
536 "f32",
537 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
538 "selp.u16 %4, 1, 0, %%p0; }",
539 ("f"(__x), "f"(__y), "f"(__z)));
540
541__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
542 (float __x, float __y, float __z, const float4 *__dPdx,
543 const float4 *__dPdy),
544 "tex.grad.3d.v4", "f32",
545 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
546 "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
547 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
548 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
549__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
550
551__IMPL_S3S("__itex3DGrad_sparse",
552 (float __x, float __y, float __z, const float4 *__dPdx,
553 const float4 *__dPdy, unsigned char *__ir),
554 "{.reg .pred %%p0;\n\t"
555 "tex.grad.3d.v4",
556 "f32",
557 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
558 "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
559 "selp.u16 %4, 1, 0, %%p0; }",
560 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
561 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
562
563__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
564 (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
565 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
566 ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
567__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
568
569__IMPL_S3S("__itex3DLod_sparse",
570 (float __x, float __y, float __z, float __level,
571 unsigned char *__ir),
572 "{.reg .pred %%p0;\n\t"
573 "tex.level.3d.v4",
574 "f32",
575 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
576 "selp.u16 %4, 1, 0, %%p0; }",
577 ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
578
579// Cubemap
580__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
581 (float __x, float __y, float __z), "tex.cube.v4", "f32",
582 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
583 ("f"(__x), "f"(__y), "f"(__z)));
584__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
585
586__IMPL_S3S("__itexCubemap_sparse",
587 (float __x, float __y, float __z, unsigned char *__ir),
588 "{.reg .pred %%p0;\n\t"
589 "tex.cube.v4",
590 "f32",
591 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
592 "selp.u16 %4, 1, 0, %%p0; }",
593 ("f"(__x), "f"(__y), "f"(__z)));
594
595__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
596 (float __x, float __y, float __z, const float4 *__dPdx,
597 const float4 *__dPdy),
598 "tex.grad.cube.v4", "f32",
599 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
600 "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
601 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
602 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
603__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
604
605__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
606 (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
607 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
608 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
609__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
610
611__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
612 (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
613 const float4 *__dPdy),
614 "tex.grad.acube.v4", "f32",
615 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
616 "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
617 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
618 "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
619 "f"(__dPdy->z)));
620__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
621
622__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
623 (float __x, float __y, float __z, int __layer, float __level),
624 "tex.level.acube.v4", "f32",
625 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
626 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
627__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
628
629__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
630 (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
631 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
632 ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
633__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
634
635// Helper class for extracting a slice of data from V4 fetch results.
636template <class __DestT, class __SrcT> struct __convert {
637 template <int __NElements = sizeof(__DestT) /
638 sizeof(typename __TypeInfoT<__DestT>::__base_t)>
639 __device__ static __DestT __run(__SrcT __v);
640 template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
641 template <> __device__ static __DestT __run<2>(__SrcT __v) {
642 return {__v.x, __v.y};
643 }
644 template <> __device__ static __DestT __run<3>(__SrcT __v) {
645 return {__v.x, __v.y, __v.z};
646 }
647 template <> __device__ static __DestT __run<4>(__SrcT __v) {
648 return {__v.x, __v.y, __v.z, __v.w};
649 }
650};
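// Editorial example (not part of the upstream header): __NElements is deduced
// from the destination type, so the right slice is picked automatically. A
// sketch for a float2 result fetched as a float4:
//
//   float4 __raw = {1.f, 2.f, 3.f, 4.f};
//   float2 __out = __convert<float2, float4>::__run(__raw);
//   // sizeof(float2) / sizeof(float) == 2, so __run<2> is selected and
//   // __out is {1.f, 2.f}; the .z/.w lanes are dropped.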
651
652// These are the top-level function overloads the __nv_tex_surf_handler macro
653// expands to. Each overload deals with one of the several ways
654// __nv_tex_surf_handler is called by CUDA headers. In the end, each of the
655// overloads does the same job -- it figures out which `__tex_fetch_v4::__run`
656// variant should be used to fetch texture data and which `__convert::__run` is
657// needed to convert it into the appropriate return type.
658
659// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
660// Data type and return type are based on ret.
661template <class __op, class __T, class... __Args>
662__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
663 __Args... __args) {
664 using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
665 *__ptr = __convert<__T, __FetchT>::__run(
666 __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
667}
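// Editorial example (not part of the upstream header): putting the pieces
// together for the object-based API. A hypothetical call
//
//   float4 __ret;
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
//
// instantiates this overload with __T = float4, computes
// __FetchT = __TypeInfoT<float4>::__fetch_t (also float4), runs
// __tex_fetch_v4<__ID("__tex2D_v2")>::__run<float4>(__obj, __x, __y) to emit
// tex.2d.v4.f32.f32, and finally copies the result through
// __convert<float4, float4>::__run, which keeps all four lanes.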
668
669#if CUDA_VERSION < 12000
670// texture<> objects get magically converted into a texture reference. However,
671// there's no way to convert them to cudaTextureObject_t at the C++ level. So, we
672// cheat a bit and use inline assembly to do it. It costs us an extra register
673// and a move, but that is easy for ptxas to optimize away.
674template <class __T>
675__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
676 cudaTextureObject_t __obj;
677 asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
678 return __obj;
679}
680
681// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
682// Data type and return type are based on ret.
683template <class __op, class __T, class __HandleT, class... __Args>
684__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
685 __Args... __args) {
686 using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
687 *__ptr = __convert<__T, __FetchT>::__run(
688 __tex_fetch_v4<__op>::template __run<__FetchT>(
689 __tex_handle_to_obj(__handle), __args...));
690}
691
692// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
693// cudaReadModeNormalizedFloat fetches always return float4.
694template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
695__device__ static void
696__tex_fetch(__DataT *, __RetT *__ptr,
697 texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
698 __Args... __args) {
699 using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
700 *__ptr = __convert<__RetT, float4>::__run(
701 __tex_fetch_v4<__op>::template __run<__FetchT>(
702 __tex_handle_to_obj(__handle), __args...));
703}
704
705// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
706// For cudaReadModeElementType, the fetch return type is based on type_dummy.
707template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
708__device__ static void
709__tex_fetch(__DataT *, __RetT *__ptr,
710 texture<__DataT, __TexT, cudaReadModeElementType> __handle,
711 __Args... __args) {
712 using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
713 *__ptr = __convert<__RetT, __FetchT>::__run(
714 __tex_fetch_v4<__op>::template __run<__FetchT>(
715 __tex_handle_to_obj(__handle), __args...));
716}
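// Editorial example (not part of the upstream header): a sketch of the legacy
// texture-reference path served by the two overloads above. With a
// hypothetical global reference
//
//   texture<float, cudaTextureType2D, cudaReadModeElementType> __tex_ref;
//
// CUDA's pre-12.0 headers issue calls of the form
//
//   float __type_dummy; // only its type matters
//   float __ret;
//   __nv_tex_surf_handler("__tex2D_v2", &__type_dummy, &__ret, __tex_ref,
//                         __x, __y);
//
// which land in the cudaReadModeElementType overload above: the reference is
// converted to a cudaTextureObject_t by __tex_handle_to_obj(), the texel is
// fetched as a float4, and __convert<float, float4> keeps just the .x lane.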
717#endif // CUDA_VERSION
718} // namespace __cuda_tex
719} // namespace
720#pragma pop_macro("__ASM_OUT")
721#pragma pop_macro("__ASM_OUTP")
722#pragma pop_macro("__Args")
723#pragma pop_macro("__ID")
724#pragma pop_macro("__IDV")
725#pragma pop_macro("__IMPL_2DGATHER")
726#pragma pop_macro("__IMPL_ALIAS")
727#pragma pop_macro("__IMPL_ALIASI")
728#pragma pop_macro("__IMPL_F1")
729#pragma pop_macro("__IMPL_F3")
730#pragma pop_macro("__IMPL_F3N")
731#pragma pop_macro("__IMPL_F3S")
732#pragma pop_macro("__IMPL_S")
733#pragma pop_macro("__IMPL_S3")
734#pragma pop_macro("__IMPL_S3I")
735#pragma pop_macro("__IMPL_S3N")
736#pragma pop_macro("__IMPL_S3NI")
737#pragma pop_macro("__IMPL_S3S")
738#pragma pop_macro("__IMPL_S3SI")
739#pragma pop_macro("__IMPL_SI")
740#pragma pop_macro("__L")
741#pragma pop_macro("__STRIP_PARENS")
742#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__