clang 22.0.0git
avxvnniint8intrin.h
Go to the documentation of this file.
1/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9#ifndef __IMMINTRIN_H
10#error \
11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXVNNIINT8INTRIN_H
15#define __AVXVNNIINT8INTRIN_H
16
// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
// clang-format on
#define _mm_dpbssd_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v16qi)(__A), \
                                       (__v16qi)(__B)))
52
// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
// clang-format on
#define _mm256_dpbssd_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v32qi)(__A), \
                                       (__v32qi)(__B)))
88
// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
// clang-format on
#define _mm_dpbssds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v16qi)(__A), \
                                        (__v16qi)(__B)))
125
// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
// clang-format on
#define _mm256_dpbssds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v32qi)(__A), \
                                        (__v32qi)(__B)))
162
// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
// clang-format on
#define _mm_dpbsud_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v16qi)(__A), \
                                       (__v16qu)(__B)))
198
// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
// clang-format on
#define _mm256_dpbsud_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v32qi)(__A), \
                                       (__v32qu)(__B)))
234
// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
// clang-format on
#define _mm_dpbsuds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v16qi)(__A), \
                                        (__v16qu)(__B)))
271
// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
// clang-format on
#define _mm256_dpbsuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v32qi)(__A), \
                                        (__v32qu)(__B)))
308
// clang-format off
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///   tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///   tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///   tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
// clang-format on
#define _mm_dpbuud_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v16qu)(__A), \
                                       (__v16qu)(__B)))
344
// clang-format off
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///   tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///   tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///   tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
// clang-format on
#define _mm256_dpbuud_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v32qu)(__A), \
                                       (__v32qu)(__B)))
380
// clang-format off
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with unsigned saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///   tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///   tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///   tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
// clang-format on
#define _mm_dpbuuds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v16qu)(__A), \
                                        (__v16qu)(__B)))
417
// clang-format off
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with unsigned saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///   tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///   tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///   tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
// clang-format on
#define _mm256_dpbuuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v32qu)(__A), \
                                        (__v32qu)(__B)))
453
454#endif // __AVXVNNIINT8INTRIN_H