clang 20.0.0git
amxcomplexintrin.h
Go to the documentation of this file.
1/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===------------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
12#endif // __IMMINTRIN_H
13
14#ifndef __AMX_COMPLEXINTRIN_H
15#define __AMX_COMPLEXINTRIN_H
16#ifdef __x86_64__
17
/* Attribute set shared by the function definitions in this header; it
   includes the "amx-complex" target feature. */
#define __DEFAULT_FN_ATTRS_COMPLEX                                             \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
20
/// Multiply two tiles of complex elements and accumulate the imaginary part
/// of the product into a tile of packed single-precision values.
///
/// Every dword element of the source tiles \a a and \a b is treated as one
/// complex number with an FP16 real part and an FP16 imaginary part. For each
/// (row of \a a, column of \a b) pair, all corresponding complex numbers (one
/// from \a a and one from \a b) are multiplied and accumulated: the imaginary
/// part of the \a a element is multiplied by the real part of the matching
/// \a b element, and the real part of the \a a element is multiplied by the
/// imaginary part of the matching \a b element. The two products are added
/// and then accumulated into the corresponding row and column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmimfp16ps(dst, a, b)                                           \
  __builtin_ia32_tcmmimfp16ps((dst), (a), (b))
64
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The real part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the negated
/// imaginary part of the \a a element is multiplied with the imaginary
/// part of the corresponding \a b element. The two accumulated results
/// are added, and then accumulated into the corresponding row and column
/// of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMRLFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmrlfp16ps(dst, a, b)                                           \
  __builtin_ia32_tcmmrlfp16ps((dst), (a), (b))
109
110static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
111_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
112 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
113 return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
114}
115
116static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
117_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
118 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
119 return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
120}
121
122/// Perform matrix multiplication of two tiles containing complex elements and
123/// accumulate the results into a packed single precision tile. Each dword
124/// element in input tiles src0 and src1 is interpreted as a complex number with
125/// FP16 real part and FP16 imaginary part.
126/// This function calculates the imaginary part of the result.
127///
128/// \headerfile <immintrin.h>
129///
130/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
131///
132/// \param dst
133/// The destination tile. Max size is 1024 Bytes.
134/// \param src0
135/// The 1st source tile. Max size is 1024 Bytes.
136/// \param src1
137/// The 2nd source tile. Max size is 1024 Bytes.
138__DEFAULT_FN_ATTRS_COMPLEX
139static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
140 __tile1024i src1) {
141 dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
142 dst->tile, src0.tile, src1.tile);
143}
144
145/// Perform matrix multiplication of two tiles containing complex elements and
146/// accumulate the results into a packed single precision tile. Each dword
147/// element in input tiles src0 and src1 is interpreted as a complex number with
148/// FP16 real part and FP16 imaginary part.
149/// This function calculates the real part of the result.
150///
151/// \headerfile <immintrin.h>
152///
153/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
154///
155/// \param dst
156/// The destination tile. Max size is 1024 Bytes.
157/// \param src0
158/// The 1st source tile. Max size is 1024 Bytes.
159/// \param src1
160/// The 2nd source tile. Max size is 1024 Bytes.
161__DEFAULT_FN_ATTRS_COMPLEX
162static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
163 __tile1024i src1) {
164 dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
165 dst->tile, src0.tile, src1.tile);
166}
167
168#endif // __x86_64__
169#endif // __AMX_COMPLEXINTRIN_H