clang 19.0.0git
avxvnniint8intrin.h
Go to the documentation of this file.
1/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9#ifndef __IMMINTRIN_H
10#error \
11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXVNNIINT8INTRIN_H
15#define __AVXVNNIINT8INTRIN_H
16
/* Define the default attributes for the functions in this file.
 *
 * Every intrinsic below is declared with one of these attribute sets: the
 * function is always inlined, carries no debug info, is compiled with the
 * "avxvnniint8" target feature enabled, and declares a minimum vector width
 * of 256 or 128 bits respectively. Both macros are #undef'd at the end of
 * this header so they do not leak into user code.
 */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(128)))
24
25/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27/// signed 16-bit results. Sum these 4 results with the corresponding
28/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29///
30/// \headerfile <x86intrin.h>
31///
32/// \code
33/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34/// \endcode
35///
36/// This intrinsic corresponds to the \c VPDPBSSD instruction.
37///
38/// \param __A
39/// A 128-bit vector of [16 x char].
40/// \param __B
41/// A 128-bit vector of [16 x char].
42/// \returns
43/// A 128-bit vector of [4 x int].
44///
45/// \code{.operation}
46/// FOR j := 0 to 3
47/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52/// ENDFOR
53/// dst[MAX:128] := 0
54/// \endcode
55static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
56 __m128i __A,
57 __m128i __B) {
58 return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
59 (__v4si)__B);
60}
61
62/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64/// signed 16-bit results. Sum these 4 results with the corresponding
65/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66///
67/// \headerfile <x86intrin.h>
68///
69/// \code
70/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71/// \endcode
72///
73/// This intrinsic corresponds to the \c VPDPBSSD instruction.
74///
75/// \param __A
76/// A 256-bit vector of [32 x char].
77/// \param __B
78/// A 256-bit vector of [32 x char].
79/// \returns
80/// A 256-bit vector of [8 x int].
81///
82/// \code{.operation}
83/// FOR j := 0 to 7
84/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89/// ENDFOR
90/// dst[MAX:256] := 0
91/// \endcode
92static __inline__ __m256i __DEFAULT_FN_ATTRS256
93_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94 return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95 (__v8si)__B);
96}
97
98/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100/// signed 16-bit results. Sum these 4 results with the corresponding
101/// 32-bit integer in \a __W with signed saturation, and store the packed
102/// 32-bit results in \a dst.
103///
104/// \headerfile <x86intrin.h>
105///
106/// \code
107/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
108/// \endcode
109///
110/// This intrinsic corresponds to the \c VPDPBSSD instruction.
111///
112/// \param __A
113/// A 128-bit vector of [16 x char].
114/// \param __B
115/// A 128-bit vector of [16 x char].
116/// \returns
117/// A 128-bit vector of [4 x int].
118///
119/// \code{.operation}
120/// FOR j := 0 to 3
121/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126/// ENDFOR
127/// dst[MAX:128] := 0
128/// \endcode
129static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
130 __m128i __A,
131 __m128i __B) {
132 return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
133 (__v4si)__B);
134}
135
136/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138/// signed 16-bit results. Sum these 4 results with the corresponding
139/// 32-bit integer in \a __W with signed saturation, and store the packed
140/// 32-bit results in \a dst.
141///
142/// \headerfile <x86intrin.h>
143///
144/// \code
145/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146/// \endcode
147///
148/// This intrinsic corresponds to the \c VPDPBSSD instruction.
149///
150/// \param __A
151/// A 256-bit vector of [32 x char].
152/// \param __B
153/// A 256-bit vector of [32 x char].
154/// \returns
155/// A 256-bit vector of [8 x int].
156///
157/// \code{.operation}
158/// FOR j := 0 to 7
159/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164/// ENDFOR
165/// dst[MAX:256] := 0
166/// \endcode
167static __inline__ __m256i __DEFAULT_FN_ATTRS256
168_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169 return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170 (__v8si)__B);
171}
172
173/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175/// signed 16-bit results. Sum these 4 results with the corresponding
176/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
177///
178/// \headerfile <x86intrin.h>
179///
180/// \code
181/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182/// \endcode
183///
184/// This intrinsic corresponds to the \c VPDPBSSD instruction.
185///
186/// \param __A
187/// A 128-bit vector of [16 x char].
188/// \param __B
189/// A 128-bit vector of [16 x unsigned char].
190/// \returns
191/// A 128-bit vector of [4 x int].
192///
193/// \code{.operation}
194/// FOR j := 0 to 3
195/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200/// ENDFOR
201/// dst[MAX:128] := 0
202/// \endcode
203static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
204 __m128i __A,
205 __m128i __B) {
206 return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
207 (__v4si)__B);
208}
209
210/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212/// signed 16-bit results. Sum these 4 results with the corresponding
213/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
214///
215/// \headerfile <x86intrin.h>
216///
217/// \code
218/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219/// \endcode
220///
221/// This intrinsic corresponds to the \c VPDPBSSD instruction.
222///
223/// \param __A
224/// A 256-bit vector of [32 x char].
225/// \param __B
226/// A 256-bit vector of [32 x unsigned char].
227/// \returns
228/// A 256-bit vector of [8 x int].
229///
230/// \code{.operation}
231/// FOR j := 0 to 7
232/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237/// ENDFOR
238/// dst[MAX:256] := 0
239/// \endcode
240static __inline__ __m256i __DEFAULT_FN_ATTRS256
241_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242 return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243 (__v8si)__B);
244}
245
246/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248/// signed 16-bit results. Sum these 4 results with the corresponding
249/// 32-bit integer in \a __W with signed saturation, and store the packed
250/// 32-bit results in \a dst.
251///
252/// \headerfile <x86intrin.h>
253///
254/// \code
255/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
256/// \endcode
257///
258/// This intrinsic corresponds to the \c VPDPBSSD instruction.
259///
260/// \param __A
261/// A 128-bit vector of [16 x char].
262/// \param __B
263/// A 128-bit vector of [16 x unsigned char].
264/// \returns
265/// A 128-bit vector of [4 x int].
266///
267/// \code{.operation}
268/// FOR j := 0 to 3
269/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274/// ENDFOR
275/// dst[MAX:128] := 0
276/// \endcode
277static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
278 __m128i __A,
279 __m128i __B) {
280 return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
281 (__v4si)__B);
282}
283
284/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286/// signed 16-bit results. Sum these 4 results with the corresponding
287/// 32-bit integer in \a __W with signed saturation, and store the packed
288/// 32-bit results in \a dst.
289///
290/// \headerfile <x86intrin.h>
291///
292/// \code
293/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294/// \endcode
295///
296/// This intrinsic corresponds to the \c VPDPBSSD instruction.
297///
298/// \param __A
299/// A 256-bit vector of [32 x char].
300/// \param __B
301/// A 256-bit vector of [32 x unsigned char].
302/// \returns
303/// A 256-bit vector of [8 x int].
304///
305/// \code{.operation}
306/// FOR j := 0 to 7
307/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312/// ENDFOR
313/// dst[MAX:256] := 0
314/// \endcode
315static __inline__ __m256i __DEFAULT_FN_ATTRS256
316_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317 return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318 (__v8si)__B);
319}
320
321/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323/// signed 16-bit results. Sum these 4 results with the corresponding
324/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
325///
326/// \headerfile <x86intrin.h>
327///
328/// \code
329/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330/// \endcode
331///
332/// This intrinsic corresponds to the \c VPDPBSSD instruction.
333///
334/// \param __A
335/// A 128-bit vector of [16 x unsigned char].
336/// \param __B
337/// A 128-bit vector of [16 x unsigned char].
338/// \returns
339/// A 128-bit vector of [4 x int].
340///
341/// \code{.operation}
342/// FOR j := 0 to 3
343/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348/// ENDFOR
349/// dst[MAX:128] := 0
350/// \endcode
351static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
352 __m128i __A,
353 __m128i __B) {
354 return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
355 (__v4si)__B);
356}
357
358/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360/// signed 16-bit results. Sum these 4 results with the corresponding
361/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
362///
363/// \headerfile <x86intrin.h>
364///
365/// \code
366/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367/// \endcode
368///
369/// This intrinsic corresponds to the \c VPDPBSSD instruction.
370///
371/// \param __A
372/// A 256-bit vector of [32 x unsigned char].
373/// \param __B
374/// A 256-bit vector of [32 x unsigned char].
375/// \returns
376/// A 256-bit vector of [8 x int].
377///
378/// \code{.operation}
379/// FOR j := 0 to 7
380/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385/// ENDFOR
386/// dst[MAX:256] := 0
387/// \endcode
388static __inline__ __m256i __DEFAULT_FN_ATTRS256
389_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390 return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391 (__v8si)__B);
392}
393
394/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396/// signed 16-bit results. Sum these 4 results with the corresponding
397/// 32-bit integer in \a __W with signed saturation, and store the packed
398/// 32-bit results in \a dst.
399///
400/// \headerfile <x86intrin.h>
401///
402/// \code
403/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
404/// \endcode
405///
406/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
407///
408/// \param __A
409/// A 128-bit vector of [16 x unsigned char].
410/// \param __B
411/// A 128-bit vector of [16 x unsigned char].
412/// \returns
413/// A 128-bit vector of [4 x int].
414///
415/// \code{.operation}
416/// FOR j := 0 to 3
417/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422/// ENDFOR
423/// dst[MAX:128] := 0
424/// \endcode
425static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
426 __m128i __A,
427 __m128i __B) {
428 return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
429 (__v4si)__B);
430}
431
432/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
433/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434/// signed 16-bit results. Sum these 4 results with the corresponding
435/// 32-bit integer in \a __W with signed saturation, and store the packed
436/// 32-bit results in \a dst.
437///
438/// \headerfile <x86intrin.h>
439///
440/// \code
441/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442/// \endcode
443///
444/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
445///
446/// \param __A
447/// A 256-bit vector of [32 x unsigned char].
448/// \param __B
449/// A 256-bit vector of [32 x unsigned char].
450/// \returns
451/// A 256-bit vector of [8 x int].
452///
453/// \code{.operation}
454/// FOR j := 0 to 7
455/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460/// ENDFOR
461/// dst[MAX:256] := 0
462/// \endcode
463static __inline__ __m256i __DEFAULT_FN_ATTRS256
464_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465 return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466 (__v8si)__B);
467}
468#undef __DEFAULT_FN_ATTRS128
469#undef __DEFAULT_FN_ATTRS256
470
471#endif // __AVXVNNIINT8INTRIN_H
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
#define __DEFAULT_FN_ATTRS128
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...