clang 23.0.0git
avx512vlvnniintrin.h
Go to the documentation of this file.
1/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
2 *
3 *
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *
8 *===-----------------------------------------------------------------------===
9 */
10#ifndef __IMMINTRIN_H
11#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX512VLVNNIINTRIN_H
15#define __AVX512VLVNNIINTRIN_H
16
/* Default attribute sets for the inline functions defined in this header.
 * Both sets request __always_inline__ and __nodebug__ and the
 * "avx512vl,avx512vnni" target; they differ only in the __min_vector_width__
 * value (128 or 256). When compiled as C++11 or newer, `constexpr` is appended
 * so the functions are declared constexpr. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
/* C, or C++ older than C++11: attributes only. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni"),                            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni"),                            \
                 __min_vector_width__(256)))
#else
/* C++11 and later: same attributes followed by constexpr. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni"),                            \
                 __min_vector_width__(128))) constexpr
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni"),                            \
                 __min_vector_width__(256))) constexpr
#endif
37
/// For every 32-bit lane, multiplies the 4 unsigned 8-bit integers of \a A by
/// the 4 corresponding signed 8-bit integers of \a B, giving 4 intermediate
/// signed 16-bit products. The 4 products are added to the 32-bit integer of
/// \a S in the same lane, and the packed 32-bit sums are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B)                                           \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S),                            \
                                       (__v32qu)(A),                           \
                                       (__v32qi)(B)))
57
/// For every 32-bit lane, multiplies the 4 unsigned 8-bit integers of \a A by
/// the 4 corresponding signed 8-bit integers of \a B, giving 4 intermediate
/// signed 16-bit products. The 4 products are added, with signed saturation,
/// to the 32-bit integer of \a S in the same lane, and the packed 32-bit sums
/// are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B)                                          \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S),                           \
                                        (__v32qu)(A),                          \
                                        (__v32qi)(B)))
78
/// For every 32-bit lane, multiplies the 2 signed 16-bit integers of \a A by
/// the 2 corresponding signed 16-bit integers of \a B, giving 2 intermediate
/// signed 32-bit products. The 2 products are added to the 32-bit integer of
/// \a S in the same lane, and the packed 32-bit sums are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B)                                           \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S),                            \
                                       (__v16hi)(A),                           \
                                       (__v16hi)(B)))
96
/// For every 32-bit lane, multiplies the 2 signed 16-bit integers of \a A by
/// the 2 corresponding signed 16-bit integers of \a B, giving 2 intermediate
/// signed 32-bit products. The 2 products are added, with signed saturation,
/// to the 32-bit integer of \a S in the same lane, and the packed 32-bit sums
/// are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B)                                          \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S),                           \
                                        (__v16hi)(A),                          \
                                        (__v16hi)(B)))
115
/// For every 32-bit lane, multiplies the 4 unsigned 8-bit integers of \a A by
/// the 4 corresponding signed 8-bit integers of \a B, giving 4 intermediate
/// signed 16-bit products. The 4 products are added to the 32-bit integer of
/// \a S in the same lane, and the packed 32-bit sums are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusd_epi32(S, A, B)                                              \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S),                            \
                                       (__v16qu)(A),                           \
                                       (__v16qi)(B)))
135
/// For every 32-bit lane, multiplies the 4 unsigned 8-bit integers of \a A by
/// the 4 corresponding signed 8-bit integers of \a B, giving 4 intermediate
/// signed 16-bit products. The 4 products are added, with signed saturation,
/// to the 32-bit integer of \a S in the same lane, and the packed 32-bit sums
/// are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusds_epi32(S, A, B)                                             \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S),                           \
                                        (__v16qu)(A),                          \
                                        (__v16qi)(B)))
156
/// For every 32-bit lane, multiplies the 2 signed 16-bit integers of \a A by
/// the 2 corresponding signed 16-bit integers of \a B, giving 2 intermediate
/// signed 32-bit products. The 2 products are added to the 32-bit integer of
/// \a S in the same lane, and the packed 32-bit sums are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssd_epi32(S, A, B)                                              \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S),                            \
                                       (__v8hi)(A),                            \
                                       (__v8hi)(B)))
174
/// For every 32-bit lane, multiplies the 2 signed 16-bit integers of \a A by
/// the 2 corresponding signed 16-bit integers of \a B, giving 2 intermediate
/// signed 32-bit products. The 2 products are added, with signed saturation,
/// to the 32-bit integer of \a S in the same lane, and the packed 32-bit sums
/// are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssds_epi32(S, A, B)                                             \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S),                           \
                                        (__v8hi)(A),                           \
                                        (__v8hi)(B)))
192
193static __inline__ __m256i __DEFAULT_FN_ATTRS256
194_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
195 return (__m256i)__builtin_ia32_selectd_256(__U,
196 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
197 (__v8si)__S);
198}
199
200static __inline__ __m256i __DEFAULT_FN_ATTRS256
201_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
202 return (__m256i)__builtin_ia32_selectd_256(__U,
203 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
204 (__v8si)_mm256_setzero_si256());
205}
206
207static __inline__ __m256i __DEFAULT_FN_ATTRS256
208_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
209 return (__m256i)__builtin_ia32_selectd_256(__U,
210 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
211 (__v8si)__S);
212}
213
215 __mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
216 return (__m256i)__builtin_ia32_selectd_256(__U,
217 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
218 (__v8si)_mm256_setzero_si256());
219}
220
221static __inline__ __m256i __DEFAULT_FN_ATTRS256
222_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
223 return (__m256i)__builtin_ia32_selectd_256(__U,
224 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
225 (__v8si)__S);
226}
227
228static __inline__ __m256i __DEFAULT_FN_ATTRS256
229_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
230 return (__m256i)__builtin_ia32_selectd_256(__U,
231 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
232 (__v8si)_mm256_setzero_si256());
233}
234
235static __inline__ __m256i __DEFAULT_FN_ATTRS256
236_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
237 return (__m256i)__builtin_ia32_selectd_256(__U,
238 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
239 (__v8si)__S);
240}
241
243 __mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
244 return (__m256i)__builtin_ia32_selectd_256(__U,
245 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
246 (__v8si)_mm256_setzero_si256());
247}
248
249static __inline__ __m128i __DEFAULT_FN_ATTRS128
250_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
251 return (__m128i)__builtin_ia32_selectd_128(__U,
252 (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
253 (__v4si)__S);
254}
255
256static __inline__ __m128i __DEFAULT_FN_ATTRS128
257_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
258 return (__m128i)__builtin_ia32_selectd_128(__U,
259 (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
260 (__v4si)_mm_setzero_si128());
261}
262
263static __inline__ __m128i __DEFAULT_FN_ATTRS128
264_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
265 return (__m128i)__builtin_ia32_selectd_128(__U,
266 (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
267 (__v4si)__S);
268}
269
270static __inline__ __m128i __DEFAULT_FN_ATTRS128
271_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
272 return (__m128i)__builtin_ia32_selectd_128(__U,
273 (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
274 (__v4si)_mm_setzero_si128());
275}
276
277static __inline__ __m128i __DEFAULT_FN_ATTRS128
278_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
279 return (__m128i)__builtin_ia32_selectd_128(__U,
280 (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
281 (__v4si)__S);
282}
283
284static __inline__ __m128i __DEFAULT_FN_ATTRS128
285_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
286 return (__m128i)__builtin_ia32_selectd_128(__U,
287 (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
288 (__v4si)_mm_setzero_si128());
289}
290
291static __inline__ __m128i __DEFAULT_FN_ATTRS128
292_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
293 return (__m128i)__builtin_ia32_selectd_128(__U,
294 (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
295 (__v4si)__S);
296}
297
298static __inline__ __m128i __DEFAULT_FN_ATTRS128
299_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
300 return (__m128i)__builtin_ia32_selectd_128(__U,
301 (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
302 (__v4si)_mm_setzero_si128());
303}
304
305#undef __DEFAULT_FN_ATTRS128
306#undef __DEFAULT_FN_ATTRS256
307
308#endif
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
unsigned char __mmask8
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
#define _mm_dpbusd_epi32(S, A, B)
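Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in A with corresponding signed 8-bit integers in B, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in S, and store the packed 32-bit results in DST.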
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
#define _mm256_dpbusds_epi32(S, A, B)
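Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in A with corresponding signed 8-bit integers in B, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in S using signed saturation, and store the packed 32-bit results in DST.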
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
#define _mm_dpwssds_epi32(S, A, B)
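Multiply groups of 2 adjacent pairs of signed 16-bit integers in A with corresponding 16-bit integers in B, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in S using signed saturation, and store the packed 32-bit results in DST.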
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
#define _mm256_dpwssds_epi32(S, A, B)
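Multiply groups of 2 adjacent pairs of signed 16-bit integers in A with corresponding 16-bit integers in B, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in S using signed saturation, and store the packed 32-bit results in DST.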
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
#define _mm256_dpwssd_epi32(S, A, B)
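Multiply groups of 2 adjacent pairs of signed 16-bit integers in A with corresponding 16-bit integers in B, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in S, and store the packed 32-bit results in DST.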
#define _mm_dpbusds_epi32(S, A, B)
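Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in A with corresponding signed 8-bit integers in B, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in S using signed saturation, and store the packed 32-bit results in DST.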
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
#define _mm256_dpbusd_epi32(S, A, B)
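Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in A with corresponding signed 8-bit integers in B, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in S, and store the packed 32-bit results in DST.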
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
#define _mm_dpwssd_epi32(S, A, B)
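Multiply groups of 2 adjacent pairs of signed 16-bit integers in A with corresponding 16-bit integers in B, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in S, and store the packed 32-bit results in DST.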
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4299
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878