clang 22.0.0git
avxvnniint16intrin.h
Go to the documentation of this file.
1/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error \
12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13#endif // __IMMINTRIN_H
14
15#ifndef __AVXVNNIINT16INTRIN_H
16#define __AVXVNNIINT16INTRIN_H
17
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword :=
///       SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwsud_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A), \
                                       (__v8hu)(__B)))
53
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword :=
///       SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwsud_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A), \
                                       (__v16hu)(__B)))
89
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with signed saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword :=
///       SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwsuds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A), \
                                        (__v8hu)(__B)))
126
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with signed saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword :=
///       SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwsuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A), \
                                        (__v16hu)(__B)))
162
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding signed 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword :=
///       ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwusd_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A), \
                                       (__v8hi)(__B)))
198
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding signed 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword :=
///       ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwusd_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v16hu)(__A), \
                                       (__v16hi)(__B)))
234
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding signed 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with signed saturation, and
/// store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword :=
///       ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwusds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v8hu)(__A), \
                                        (__v8hi)(__B)))
270
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding signed 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with signed saturation, and
/// store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword :=
///       ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwusds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v16hu)(__A), \
                                        (__v16hi)(__B)))
306
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate unsigned 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword :=
///       ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwuud_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v8hu)(__A), \
                                       (__v8hu)(__B)))
342
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate unsigned 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword :=
///       ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwuud_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v16hu)(__A), \
                                       (__v16hu)(__B)))
378
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate unsigned 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with unsigned saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword :=
///       ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwuuds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v8hu)(__A), \
                                        (__v8hu)(__B)))
414
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate unsigned 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with unsigned saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword :=
///       ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwuuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v16hu)(__A), \
                                        (__v16hu)(__B)))
450
451#endif // __AVXVNNIINT16INTRIN_H