clang 22.0.0git
avxifmaintrin.h
Go to the documentation of this file.
1/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXIFMAINTRIN_H
15#define __AVXIFMAINTRIN_H
16
17/* Default attributes applied to every intrinsic defined in this file:
 * __always_inline__, __nodebug__, __target__("avxifma") and a
 * __min_vector_width__ of 128 or 256 bits. When compiled as C++11 or later
 * the macros additionally append constexpr; otherwise they expand to the
 * attribute list alone. */
18#if defined(__cplusplus) && (__cplusplus >= 201103L)
19#define __DEFAULT_FN_ATTRS128 \
20 __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
21 __min_vector_width__(128))) constexpr
22#define __DEFAULT_FN_ATTRS256 \
23 __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
24 __min_vector_width__(256))) constexpr
25#else
26#define __DEFAULT_FN_ATTRS128 \
27 __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
28 __min_vector_width__(128)))
29#define __DEFAULT_FN_ATTRS256 \
30 __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
31 __min_vector_width__(256)))
32#endif
33
/* When __AVXIFMA__ is defined but __AVX512IFMA__ is not, map the unsuffixed
 * _mm*_madd52{hi,lo}_epu64 names onto the _avx_ variants defined below. */
34#if !defined(__AVX512IFMA__) && defined(__AVXIFMA__)
35#define _mm_madd52hi_epu64(X, Y, Z) _mm_madd52hi_avx_epu64(X, Y, Z)
36#define _mm_madd52lo_epu64(X, Y, Z) _mm_madd52lo_avx_epu64(X, Y, Z)
37#define _mm256_madd52hi_epu64(X, Y, Z) _mm256_madd52hi_avx_epu64(X, Y, Z)
38#define _mm256_madd52lo_epu64(X, Y, Z) _mm256_madd52lo_avx_epu64(X, Y, Z)
39#endif /* !__AVX512IFMA__ && __AVXIFMA__ */
40
41// The intrinsics below must use the VEX encoding.
42
43/// Multiplies the low 52 bits of each 64-bit element of \a __Y by the low
44/// 52 bits of the corresponding element of \a __Z, giving a 104-bit
45/// intermediate product per element. The high 52 bits of each product are
46/// zero-extended and added to the corresponding 64-bit element of \a __X.
47///
48/// \headerfile <immintrin.h>
49///
50/// \code
51/// __m128i
52/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
53/// \endcode
54///
55/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
56///
57/// \return
58/// A 128-bit vector of [2 x i64] containing the per-element sums.
59/// \param __X
60/// A 128-bit vector of [2 x i64] supplying the 64-bit addends.
61/// \param __Y
62/// A 128-bit vector of [2 x i64]; bits [51:0] of each element are used.
63/// \param __Z
64/// A 128-bit vector of [2 x i64]; bits [51:0] of each element are used.
65///
66/// \code{.operation}
67/// FOR j := 0 to 1
68///   i := j*64
69///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
70///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
71/// ENDFOR
72/// dst[MAX:128] := 0
73/// \endcode
74static __inline__ __m128i __DEFAULT_FN_ATTRS128
75_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
76 return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
77 (__v2di)__Z);
78}
79
80/// Multiplies the low 52 bits of each 64-bit element of \a __Y by the low
81/// 52 bits of the corresponding element of \a __Z, giving a 104-bit
82/// intermediate product per element. The high 52 bits of each product are
83/// zero-extended and added to the corresponding 64-bit element of \a __X.
84///
85/// \headerfile <immintrin.h>
86///
87/// \code
88/// __m256i
89/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
90/// \endcode
91///
92/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
93///
94/// \return
95/// A 256-bit vector of [4 x i64] containing the per-element sums.
96/// \param __X
97/// A 256-bit vector of [4 x i64] supplying the 64-bit addends.
98/// \param __Y
99/// A 256-bit vector of [4 x i64]; bits [51:0] of each element are used.
100/// \param __Z
101/// A 256-bit vector of [4 x i64]; bits [51:0] of each element are used.
102///
103/// \code{.operation}
104/// FOR j := 0 to 3
105///   i := j*64
106///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
107///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
108/// ENDFOR
109/// dst[MAX:256] := 0
110/// \endcode
111static __inline__ __m256i __DEFAULT_FN_ATTRS256
112_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
113 return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
114 (__v4di)__Z);
115}
116
117/// Multiplies the low 52 bits of each 64-bit element of \a __Y by the low
118/// 52 bits of the corresponding element of \a __Z, giving a 104-bit
119/// intermediate product per element. The low 52 bits of each product are
120/// zero-extended and added to the corresponding 64-bit element of \a __X.
121///
122/// \headerfile <immintrin.h>
123///
124/// \code
125/// __m128i
126/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
127/// \endcode
128///
129/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
130///
131/// \return
132/// A 128-bit vector of [2 x i64] containing the per-element sums.
133/// \param __X
134/// A 128-bit vector of [2 x i64] supplying the 64-bit addends.
135/// \param __Y
136/// A 128-bit vector of [2 x i64]; bits [51:0] of each element are used.
137/// \param __Z
138/// A 128-bit vector of [2 x i64]; bits [51:0] of each element are used.
139///
140/// \code{.operation}
141/// FOR j := 0 to 1
142///   i := j*64
143///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
144///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
145/// ENDFOR
146/// dst[MAX:128] := 0
147/// \endcode
148static __inline__ __m128i __DEFAULT_FN_ATTRS128
149_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
150 return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
151 (__v2di)__Z);
152}
153
154/// Multiplies the low 52 bits of each 64-bit element of \a __Y by the low
155/// 52 bits of the corresponding element of \a __Z, giving a 104-bit
156/// intermediate product per element. The low 52 bits of each product are
157/// zero-extended and added to the corresponding 64-bit element of \a __X.
158///
159/// \headerfile <immintrin.h>
160///
161/// \code
162/// __m256i
163/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
164/// \endcode
165///
166/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
167///
168/// \return
169/// A 256-bit vector of [4 x i64] containing the per-element sums.
170/// \param __X
171/// A 256-bit vector of [4 x i64] supplying the 64-bit addends.
172/// \param __Y
173/// A 256-bit vector of [4 x i64]; bits [51:0] of each element are used.
174/// \param __Z
175/// A 256-bit vector of [4 x i64]; bits [51:0] of each element are used.
176///
177/// \code{.operation}
178/// FOR j := 0 to 3
179///   i := j*64
180///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
181///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
182/// ENDFOR
183/// dst[MAX:256] := 0
184/// \endcode
185static __inline__ __m256i __DEFAULT_FN_ATTRS256
186_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
187 return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
188 (__v4di)__Z);
189}
/* Remove the attribute helper macros defined near the top of this file. */
190#undef __DEFAULT_FN_ATTRS128
191#undef __DEFAULT_FN_ATTRS256
192
193#endif // __AVXIFMAINTRIN_H
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z)
Multiply packed unsigned 52-bit integers in each 64-bit element of __Y and __Z to form a 104-bit inte...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z)
Multiply packed unsigned 52-bit integers in each 64-bit element of __Y and __Z to form a 104-bit inte...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z)
Multiply packed unsigned 52-bit integers in each 64-bit element of __Y and __Z to form a 104-bit inte...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z)
Multiply packed unsigned 52-bit integers in each 64-bit element of __Y and __Z to form a 104-bit inte...
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19