/* Listing of avx2intrin.h as shipped with clang 22.0.0git. */
/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __IMMINTRIN_H
11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX2INTRIN_H
15#define __AVX2INTRIN_H
16
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
                 __min_vector_width__(128)))

/* When compiled as C++11 or later, the *_CONSTEXPR variants also mark the
   function constexpr; otherwise they are the plain attribute sets. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
///    \a Y.
///
///    Eight SAD results are computed using the lower half of the input
///    vectors, and another eight using the upper half. These 16-bit values
///    are returned in the lower and upper halves of the 256-bit result,
///    respectively.
///
///    A single SAD operation selects four bytes from \a X and four bytes from
///    \a Y as input. It computes the differences between each \a X byte and
///    the corresponding \a Y byte, takes the absolute value of each
///    difference, and sums these four values to form one 16-bit result. The
///    intrinsic computes 16 of these results with different sets of input
///    bytes.
///
///    For each set of eight results, the SAD operations use the same four
///    bytes from \a Y; the starting bit position for these four bytes is
///    specified by a two-bit field of \a M times 32. The eight operations use
///    successive sets of four bytes from \a X; the starting bit position for
///    the first set of four bytes is specified by a one-bit field of \a M
///    times 32. For the lower 128-bit lane these fields are \a M[1:0] and
///    \a M[2]; for the upper 128-bit lane they are \a M[4:3] and \a M[5].
///    These bit positions are all relative to the 128-bit lane for each set
///    of eight operations.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///    An unsigned immediate value specifying the starting positions of the
///    bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X),                   \
                                      (__v32qi)(__m256i)(Y), (int)(M)))

95/// Computes the absolute value of each signed byte in the 256-bit integer
96/// vector \a __a and returns each value in the corresponding byte of
97/// the result.
98///
99/// \headerfile <immintrin.h>
100///
101/// This intrinsic corresponds to the \c VPABSB instruction.
102///
103/// \param __a
104/// A 256-bit integer vector.
105/// \returns A 256-bit integer vector containing the result.
106static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
108 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
109}
110
111/// Computes the absolute value of each signed 16-bit element in the 256-bit
112/// vector of [16 x i16] in \a __a and returns each value in the
113/// corresponding element of the result.
114///
115/// \headerfile <immintrin.h>
116///
117/// This intrinsic corresponds to the \c VPABSW instruction.
118///
119/// \param __a
120/// A 256-bit vector of [16 x i16].
121/// \returns A 256-bit vector of [16 x i16] containing the result.
122static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
124 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
125}
126
127/// Computes the absolute value of each signed 32-bit element in the 256-bit
128/// vector of [8 x i32] in \a __a and returns each value in the
129/// corresponding element of the result.
130///
131/// \headerfile <immintrin.h>
132///
133/// This intrinsic corresponds to the \c VPABSD instruction.
134///
135/// \param __a
136/// A 256-bit vector of [8 x i32].
137/// \returns A 256-bit vector of [8 x i32] containing the result.
138static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
140 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
141}
142
143/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
144/// integers using signed saturation, and returns the 256-bit result.
145///
146/// \code{.operation}
147/// FOR i := 0 TO 7
148/// j := i*16
149/// k := i*8
150/// result[7+k:k] := SATURATE8(__a[15+j:j])
151/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
152/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
153/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
154/// ENDFOR
155/// \endcode
156///
157/// \headerfile <immintrin.h>
158///
159/// This intrinsic corresponds to the \c VPACKSSWB instruction.
160///
161/// \param __a
162/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
163/// result[191:128].
164/// \param __b
165/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
166/// result[255:192].
167/// \returns A 256-bit integer vector containing the result.
168static __inline__ __m256i __DEFAULT_FN_ATTRS256
169_mm256_packs_epi16(__m256i __a, __m256i __b)
170{
171 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
172}
173
174/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
175/// integers using signed saturation, and returns the resulting 256-bit
176/// vector of [16 x i16].
177///
178/// \code{.operation}
179/// FOR i := 0 TO 3
180/// j := i*32
181/// k := i*16
182/// result[15+k:k] := SATURATE16(__a[31+j:j])
183/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
184/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
185/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
186/// ENDFOR
187/// \endcode
188///
189/// \headerfile <immintrin.h>
190///
191/// This intrinsic corresponds to the \c VPACKSSDW instruction.
192///
193/// \param __a
194/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
195/// result[191:128].
196/// \param __b
197/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
198/// result[255:192].
199/// \returns A 256-bit vector of [16 x i16] containing the result.
200static __inline__ __m256i __DEFAULT_FN_ATTRS256
201_mm256_packs_epi32(__m256i __a, __m256i __b)
202{
203 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
204}
205
206/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
207/// using unsigned saturation, and returns the 256-bit result.
208///
209/// \code{.operation}
210/// FOR i := 0 TO 7
211/// j := i*16
212/// k := i*8
213/// result[7+k:k] := SATURATE8U(__a[15+j:j])
214/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
215/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
216/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
217/// ENDFOR
218/// \endcode
219///
220/// \headerfile <immintrin.h>
221///
222/// This intrinsic corresponds to the \c VPACKUSWB instruction.
223///
224/// \param __a
225/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
226/// result[191:128].
227/// \param __b
228/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
229/// result[255:192].
230/// \returns A 256-bit integer vector containing the result.
231static __inline__ __m256i __DEFAULT_FN_ATTRS256
232_mm256_packus_epi16(__m256i __a, __m256i __b)
233{
234 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
235}
236
237/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
238/// using unsigned saturation, and returns the resulting 256-bit vector of
239/// [16 x i16].
240///
241/// \code{.operation}
242/// FOR i := 0 TO 3
243/// j := i*32
244/// k := i*16
245/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
246/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
247/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
248/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
249/// ENDFOR
250/// \endcode
251///
252/// \headerfile <immintrin.h>
253///
254/// This intrinsic corresponds to the \c VPACKUSDW instruction.
255///
256/// \param __V1
257/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
258/// result[191:128].
259/// \param __V2
260/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
261/// result[255:192].
262/// \returns A 256-bit vector of [16 x i16] containing the result.
263static __inline__ __m256i __DEFAULT_FN_ATTRS256
264_mm256_packus_epi32(__m256i __V1, __m256i __V2)
265{
266 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
267}
268
269/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
270/// vectors and returns the lower 8 bits of each sum in the corresponding
271/// byte of the 256-bit integer vector result (overflow is ignored).
272///
273/// \headerfile <immintrin.h>
274///
275/// This intrinsic corresponds to the \c VPADDB instruction.
276///
277/// \param __a
278/// A 256-bit integer vector containing one of the source operands.
279/// \param __b
280/// A 256-bit integer vector containing one of the source operands.
281/// \returns A 256-bit integer vector containing the sums.
282static __inline__ __m256i __DEFAULT_FN_ATTRS256
283_mm256_add_epi8(__m256i __a, __m256i __b)
284{
285 return (__m256i)((__v32qu)__a + (__v32qu)__b);
286}
287
288/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
289/// [16 x i16] and returns the lower 16 bits of each sum in the
290/// corresponding element of the [16 x i16] result (overflow is ignored).
291///
292/// \headerfile <immintrin.h>
293///
294/// This intrinsic corresponds to the \c VPADDW instruction.
295///
296/// \param __a
297/// A 256-bit vector of [16 x i16] containing one of the source operands.
298/// \param __b
299/// A 256-bit vector of [16 x i16] containing one of the source operands.
300/// \returns A 256-bit vector of [16 x i16] containing the sums.
301static __inline__ __m256i __DEFAULT_FN_ATTRS256
302_mm256_add_epi16(__m256i __a, __m256i __b)
303{
304 return (__m256i)((__v16hu)__a + (__v16hu)__b);
305}
306
307/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
308/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
309/// element of the [8 x i32] result (overflow is ignored).
310///
311/// \headerfile <immintrin.h>
312///
313/// This intrinsic corresponds to the \c VPADDD instruction.
314///
315/// \param __a
316/// A 256-bit vector of [8 x i32] containing one of the source operands.
317/// \param __b
318/// A 256-bit vector of [8 x i32] containing one of the source operands.
319/// \returns A 256-bit vector of [8 x i32] containing the sums.
320static __inline__ __m256i __DEFAULT_FN_ATTRS256
321_mm256_add_epi32(__m256i __a, __m256i __b)
322{
323 return (__m256i)((__v8su)__a + (__v8su)__b);
324}
325
326/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
327/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
328/// element of the [4 x i64] result (overflow is ignored).
329///
330/// \headerfile <immintrin.h>
331///
332/// This intrinsic corresponds to the \c VPADDQ instruction.
333///
334/// \param __a
335/// A 256-bit vector of [4 x i64] containing one of the source operands.
336/// \param __b
337/// A 256-bit vector of [4 x i64] containing one of the source operands.
338/// \returns A 256-bit vector of [4 x i64] containing the sums.
339static __inline__ __m256i __DEFAULT_FN_ATTRS256
340_mm256_add_epi64(__m256i __a, __m256i __b)
341{
342 return (__m256i)((__v4du)__a + (__v4du)__b);
343}
344
345/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
346/// vectors using signed saturation, and returns each sum in the
347/// corresponding byte of the 256-bit integer vector result.
348///
349/// \headerfile <immintrin.h>
350///
351/// This intrinsic corresponds to the \c VPADDSB instruction.
352///
353/// \param __a
354/// A 256-bit integer vector containing one of the source operands.
355/// \param __b
356/// A 256-bit integer vector containing one of the source operands.
357/// \returns A 256-bit integer vector containing the sums.
358static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
359_mm256_adds_epi8(__m256i __a, __m256i __b) {
360 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
361}
362
363/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
364/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
365///
366/// \headerfile <immintrin.h>
367///
368/// This intrinsic corresponds to the \c VPADDSW instruction.
369///
370/// \param __a
371/// A 256-bit vector of [16 x i16] containing one of the source operands.
372/// \param __b
373/// A 256-bit vector of [16 x i16] containing one of the source operands.
374/// \returns A 256-bit vector of [16 x i16] containing the sums.
375static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
376_mm256_adds_epi16(__m256i __a, __m256i __b) {
377 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
378}
379
380/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
381/// vectors using unsigned saturation, and returns each sum in the
382/// corresponding byte of the 256-bit integer vector result.
383///
384/// \headerfile <immintrin.h>
385///
386/// This intrinsic corresponds to the \c VPADDUSB instruction.
387///
388/// \param __a
389/// A 256-bit integer vector containing one of the source operands.
390/// \param __b
391/// A 256-bit integer vector containing one of the source operands.
392/// \returns A 256-bit integer vector containing the sums.
393static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
394_mm256_adds_epu8(__m256i __a, __m256i __b) {
395 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
396}
397
398/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
399/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
400///
401/// \headerfile <immintrin.h>
402///
403/// This intrinsic corresponds to the \c VPADDUSW instruction.
404///
405/// \param __a
406/// A 256-bit vector of [16 x i16] containing one of the source operands.
407/// \param __b
408/// A 256-bit vector of [16 x i16] containing one of the source operands.
409/// \returns A 256-bit vector of [16 x i16] containing the sums.
410static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
411_mm256_adds_epu16(__m256i __a, __m256i __b) {
412 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
413}
414
/// Uses the lower half of the 256-bit vector \a a as the upper half of a
///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
///    as the lower half of the temporary value. Right-shifts the temporary
///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
///    \a b to make another temporary value, right shifts by \a n, and uses
///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
///    result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a),                   \
                                      (__v32qi)(__m256i)(b), (n)))

443/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
444/// \a __b.
445///
446/// \headerfile <immintrin.h>
447///
448/// This intrinsic corresponds to the \c VPAND instruction.
449///
450/// \param __a
451/// A 256-bit integer vector.
452/// \param __b
453/// A 256-bit integer vector.
454/// \returns A 256-bit integer vector containing the result.
455static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
456_mm256_and_si256(__m256i __a, __m256i __b)
457{
458 return (__m256i)((__v4du)__a & (__v4du)__b);
459}
460
461/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
462/// the bitwise NOT of the 256-bit integer vector in \a __a.
463///
464/// \headerfile <immintrin.h>
465///
466/// This intrinsic corresponds to the \c VPANDN instruction.
467///
468/// \param __a
469/// A 256-bit integer vector.
470/// \param __b
471/// A 256-bit integer vector.
472/// \returns A 256-bit integer vector containing the result.
473static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
474_mm256_andnot_si256(__m256i __a, __m256i __b)
475{
476 return (__m256i)(~(__v4du)__a & (__v4du)__b);
477}
478
479/// Computes the averages of the corresponding unsigned bytes in the two
480/// 256-bit integer vectors in \a __a and \a __b and returns each
481/// average in the corresponding byte of the 256-bit result.
482///
483/// \code{.operation}
484/// FOR i := 0 TO 31
485/// j := i*8
486/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
487/// ENDFOR
488/// \endcode
489///
490/// \headerfile <immintrin.h>
491///
492/// This intrinsic corresponds to the \c VPAVGB instruction.
493///
494/// \param __a
495/// A 256-bit integer vector.
496/// \param __b
497/// A 256-bit integer vector.
498/// \returns A 256-bit integer vector containing the result.
499static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
500_mm256_avg_epu8(__m256i __a, __m256i __b) {
501 return (__m256i)__builtin_ia32_pavgb256((__v32qu)__a, (__v32qu)__b);
502}
503
504/// Computes the averages of the corresponding unsigned 16-bit integers in
505/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
506/// each average in the corresponding element of the 256-bit result.
507///
508/// \code{.operation}
509/// FOR i := 0 TO 15
510/// j := i*16
511/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
512/// ENDFOR
513/// \endcode
514///
515/// \headerfile <immintrin.h>
516///
517/// This intrinsic corresponds to the \c VPAVGW instruction.
518///
519/// \param __a
520/// A 256-bit vector of [16 x i16].
521/// \param __b
522/// A 256-bit vector of [16 x i16].
523/// \returns A 256-bit vector of [16 x i16] containing the result.
524static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
525_mm256_avg_epu16(__m256i __a, __m256i __b) {
526 return (__m256i)__builtin_ia32_pavgw256((__v16hu)__a, (__v16hu)__b);
527}
528
529/// Merges 8-bit integer values from either of the two 256-bit vectors
530/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
531/// the resulting 256-bit integer vector.
532///
533/// \code{.operation}
534/// FOR i := 0 TO 31
535/// j := i*8
536/// IF __M[7+i] == 0
537/// result[7+j:j] := __V1[7+j:j]
538/// ELSE
539/// result[7+j:j] := __V2[7+j:j]
540/// FI
541/// ENDFOR
542/// \endcode
543///
544/// \headerfile <immintrin.h>
545///
546/// This intrinsic corresponds to the \c VPBLENDVB instruction.
547///
548/// \param __V1
549/// A 256-bit integer vector containing source values.
550/// \param __V2
551/// A 256-bit integer vector containing source values.
552/// \param __M
553/// A 256-bit integer vector, with bit [7] of each byte specifying the
554/// source for each corresponding byte of the result. When the mask bit
555/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
556/// \a __V2.
557/// \returns A 256-bit integer vector containing the result.
558static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
559_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
560 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
561 (__v32qi)__M);
562}
563
/// Merges 16-bit integer values from either of the two 256-bit vectors
///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
///    and returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1),                  \
                                      (__v16hi)(__m256i)(V2), (int)(M)))

605/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
606/// \a __b for equality and returns the outcomes in the corresponding
607/// bytes of the 256-bit result.
608///
609/// \code{.operation}
610/// FOR i := 0 TO 31
611/// j := i*8
612/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
613/// ENDFOR
614/// \endcode
615///
616/// \headerfile <immintrin.h>
617///
618/// This intrinsic corresponds to the \c VPCMPEQB instruction.
619///
620/// \param __a
621/// A 256-bit integer vector containing one of the inputs.
622/// \param __b
623/// A 256-bit integer vector containing one of the inputs.
624/// \returns A 256-bit integer vector containing the result.
625static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
626_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
627{
628 return (__m256i)((__v32qi)__a == (__v32qi)__b);
629}
630
631/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
632/// \a __a and \a __b for equality and returns the outcomes in the
633/// corresponding elements of the 256-bit result.
634///
635/// \code{.operation}
636/// FOR i := 0 TO 15
637/// j := i*16
638/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
639/// ENDFOR
640/// \endcode
641///
642/// \headerfile <immintrin.h>
643///
644/// This intrinsic corresponds to the \c VPCMPEQW instruction.
645///
646/// \param __a
647/// A 256-bit vector of [16 x i16] containing one of the inputs.
648/// \param __b
649/// A 256-bit vector of [16 x i16] containing one of the inputs.
650/// \returns A 256-bit vector of [16 x i16] containing the result.
651static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
652_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
653{
654 return (__m256i)((__v16hi)__a == (__v16hi)__b);
655}
656
657/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
658/// \a __a and \a __b for equality and returns the outcomes in the
659/// corresponding elements of the 256-bit result.
660///
661/// \code{.operation}
662/// FOR i := 0 TO 7
663/// j := i*32
664/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
665/// ENDFOR
666/// \endcode
667///
668/// \headerfile <immintrin.h>
669///
670/// This intrinsic corresponds to the \c VPCMPEQD instruction.
671///
672/// \param __a
673/// A 256-bit vector of [8 x i32] containing one of the inputs.
674/// \param __b
675/// A 256-bit vector of [8 x i32] containing one of the inputs.
676/// \returns A 256-bit vector of [8 x i32] containing the result.
677static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
678_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
679{
680 return (__m256i)((__v8si)__a == (__v8si)__b);
681}
682
683/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
684/// \a __a and \a __b for equality and returns the outcomes in the
685/// corresponding elements of the 256-bit result.
686///
687/// \code{.operation}
688/// FOR i := 0 TO 3
689/// j := i*64
690/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
691/// ENDFOR
692/// \endcode
693///
694/// \headerfile <immintrin.h>
695///
696/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
697///
698/// \param __a
699/// A 256-bit vector of [4 x i64] containing one of the inputs.
700/// \param __b
701/// A 256-bit vector of [4 x i64] containing one of the inputs.
702/// \returns A 256-bit vector of [4 x i64] containing the result.
703static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
704_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
705{
706 return (__m256i)((__v4di)__a == (__v4di)__b);
707}
708
709/// Compares corresponding signed bytes in the 256-bit integer vectors in
710/// \a __a and \a __b for greater-than and returns the outcomes in the
711/// corresponding bytes of the 256-bit result.
712///
713/// \code{.operation}
714/// FOR i := 0 TO 31
715/// j := i*8
716/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
717/// ENDFOR
718/// \endcode
719///
720/// \headerfile <immintrin.h>
721///
722/// This intrinsic corresponds to the \c VPCMPGTB instruction.
723///
724/// \param __a
725/// A 256-bit integer vector containing one of the inputs.
726/// \param __b
727/// A 256-bit integer vector containing one of the inputs.
728/// \returns A 256-bit integer vector containing the result.
729static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
730_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
731{
732 /* This function always performs a signed comparison, but __v32qi is a char
733 which may be signed or unsigned, so use __v32qs. */
734 return (__m256i)((__v32qs)__a > (__v32qs)__b);
735}
736
737/// Compares corresponding signed elements in the 256-bit vectors of
738/// [16 x i16] in \a __a and \a __b for greater-than and returns the
739/// outcomes in the corresponding elements of the 256-bit result.
740///
741/// \code{.operation}
742/// FOR i := 0 TO 15
743/// j := i*16
744/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
745/// ENDFOR
746/// \endcode
747///
748/// \headerfile <immintrin.h>
749///
750/// This intrinsic corresponds to the \c VPCMPGTW instruction.
751///
752/// \param __a
753/// A 256-bit vector of [16 x i16] containing one of the inputs.
754/// \param __b
755/// A 256-bit vector of [16 x i16] containing one of the inputs.
756/// \returns A 256-bit vector of [16 x i16] containing the result.
757static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
758_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
759{
760 return (__m256i)((__v16hi)__a > (__v16hi)__b);
761}
762
763/// Compares corresponding signed elements in the 256-bit vectors of
764/// [8 x i32] in \a __a and \a __b for greater-than and returns the
765/// outcomes in the corresponding elements of the 256-bit result.
766///
767/// \code{.operation}
768/// FOR i := 0 TO 7
769/// j := i*32
770/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
771/// ENDFOR
772/// \endcode
773///
774/// \headerfile <immintrin.h>
775///
776/// This intrinsic corresponds to the \c VPCMPGTD instruction.
777///
778/// \param __a
779/// A 256-bit vector of [8 x i32] containing one of the inputs.
780/// \param __b
781/// A 256-bit vector of [8 x i32] containing one of the inputs.
782/// \returns A 256-bit vector of [8 x i32] containing the result.
783static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
784_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
785{
786 return (__m256i)((__v8si)__a > (__v8si)__b);
787}
788
789/// Compares corresponding signed elements in the 256-bit vectors of
790/// [4 x i64] in \a __a and \a __b for greater-than and returns the
791/// outcomes in the corresponding elements of the 256-bit result.
792///
793/// \code{.operation}
794/// FOR i := 0 TO 3
795/// j := i*64
796/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
797/// ENDFOR
798/// \endcode
799///
800/// \headerfile <immintrin.h>
801///
802/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
803///
804/// \param __a
805/// A 256-bit vector of [4 x i64] containing one of the inputs.
806/// \param __b
807/// A 256-bit vector of [4 x i64] containing one of the inputs.
808/// \returns A 256-bit vector of [4 x i64] containing the result.
809static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
810_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
811{
812 return (__m256i)((__v4di)__a > (__v4di)__b);
813}
814
815/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
816/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
817/// element of the [16 x i16] result (overflow is ignored). Sums from
818/// \a __a are returned in the lower 64 bits of each 128-bit half of the
819/// result; sums from \a __b are returned in the upper 64 bits of each
820/// 128-bit half of the result.
821///
822/// \code{.operation}
823/// FOR i := 0 TO 1
824/// j := i*128
825/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
826/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
827/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
828/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
829/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
830/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
831/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
832/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
833/// ENDFOR
834/// \endcode
835///
836/// \headerfile <immintrin.h>
837///
838/// This intrinsic corresponds to the \c VPHADDW instruction.
839///
840/// \param __a
841/// A 256-bit vector of [16 x i16] containing one of the source operands.
842/// \param __b
843/// A 256-bit vector of [16 x i16] containing one of the source operands.
844/// \returns A 256-bit vector of [16 x i16] containing the sums.
845static __inline__ __m256i __DEFAULT_FN_ATTRS256
846_mm256_hadd_epi16(__m256i __a, __m256i __b)
847{
848 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
849}
850
851/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
852/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
853/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
854/// are returned in the lower 64 bits of each 128-bit half of the result;
855/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
856/// of the result.
857///
858/// \code{.operation}
859/// FOR i := 0 TO 1
860/// j := i*128
861/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
862/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
863/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
864/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
865/// ENDFOR
866/// \endcode
867///
868/// \headerfile <immintrin.h>
869///
870/// This intrinsic corresponds to the \c VPHADDD instruction.
871///
872/// \param __a
873/// A 256-bit vector of [8 x i32] containing one of the source operands.
874/// \param __b
875/// A 256-bit vector of [8 x i32] containing one of the source operands.
876/// \returns A 256-bit vector of [8 x i32] containing the sums.
877static __inline__ __m256i __DEFAULT_FN_ATTRS256
878_mm256_hadd_epi32(__m256i __a, __m256i __b)
879{
880 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
881}
882
883/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
884/// vectors of [16 x i16] using signed saturation and returns each sum in
885/// an element of the [16 x i16] result. Sums from \a __a are returned in
886/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
887/// are returned in the upper 64 bits of each 128-bit half of the result.
888///
889/// \code{.operation}
890/// FOR i := 0 TO 1
891/// j := i*128
892/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
893/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
894/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
895/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
896/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
897/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
898/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
899/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
900/// ENDFOR
901/// \endcode
902///
903/// \headerfile <immintrin.h>
904///
905/// This intrinsic corresponds to the \c VPHADDSW instruction.
906///
907/// \param __a
908/// A 256-bit vector of [16 x i16] containing one of the source operands.
909/// \param __b
910/// A 256-bit vector of [16 x i16] containing one of the source operands.
911/// \returns A 256-bit vector of [16 x i16] containing the sums.
912static __inline__ __m256i __DEFAULT_FN_ATTRS256
913_mm256_hadds_epi16(__m256i __a, __m256i __b)
914{
915 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
916}
917
918/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
919/// vectors of [16 x i16] and returns the lower 16 bits of each difference
920/// in an element of the [16 x i16] result (overflow is ignored).
921/// Differences from \a __a are returned in the lower 64 bits of each
922/// 128-bit half of the result; differences from \a __b are returned in the
923/// upper 64 bits of each 128-bit half of the result.
924///
925/// \code{.operation}
926/// FOR i := 0 TO 1
927/// j := i*128
928/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
929/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
930/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
931/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
932/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
933/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
934/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
935/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
936/// ENDFOR
937/// \endcode
938///
939/// \headerfile <immintrin.h>
940///
941/// This intrinsic corresponds to the \c VPHSUBW instruction.
942///
943/// \param __a
944/// A 256-bit vector of [16 x i16] containing one of the source operands.
945/// \param __b
946/// A 256-bit vector of [16 x i16] containing one of the source operands.
947/// \returns A 256-bit vector of [16 x i16] containing the differences.
948static __inline__ __m256i __DEFAULT_FN_ATTRS256
949_mm256_hsub_epi16(__m256i __a, __m256i __b)
950{
951 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
952}
953
954/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
955/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
956/// an element of the [8 x i32] result (overflow is ignored). Differences
957/// from \a __a are returned in the lower 64 bits of each 128-bit half of
958/// the result; differences from \a __b are returned in the upper 64 bits
959/// of each 128-bit half of the result.
960///
961/// \code{.operation}
962/// FOR i := 0 TO 1
963/// j := i*128
964/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
965/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
966/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
967/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
968/// ENDFOR
969/// \endcode
970///
971/// \headerfile <immintrin.h>
972///
973/// This intrinsic corresponds to the \c VPHSUBD instruction.
974///
975/// \param __a
976/// A 256-bit vector of [8 x i32] containing one of the source operands.
977/// \param __b
978/// A 256-bit vector of [8 x i32] containing one of the source operands.
979/// \returns A 256-bit vector of [8 x i32] containing the differences.
980static __inline__ __m256i __DEFAULT_FN_ATTRS256
981_mm256_hsub_epi32(__m256i __a, __m256i __b)
982{
983 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
984}
985
986/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
987/// vectors of [16 x i16] using signed saturation and returns each sum in
988/// an element of the [16 x i16] result. Differences from \a __a are
989/// returned in the lower 64 bits of each 128-bit half of the result;
990/// differences from \a __b are returned in the upper 64 bits of each
991/// 128-bit half of the result.
992///
993/// \code{.operation}
994/// FOR i := 0 TO 1
995/// j := i*128
996/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
997/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
998/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
999/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1000/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1001/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1002/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1003/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1004/// ENDFOR
1005/// \endcode
1006///
1007/// \headerfile <immintrin.h>
1008///
1009/// This intrinsic corresponds to the \c VPHSUBSW instruction.
1010///
1011/// \param __a
1012/// A 256-bit vector of [16 x i16] containing one of the source operands.
1013/// \param __b
1014/// A 256-bit vector of [16 x i16] containing one of the source operands.
1015/// \returns A 256-bit vector of [16 x i16] containing the differences.
1016static __inline__ __m256i __DEFAULT_FN_ATTRS256
1017_mm256_hsubs_epi16(__m256i __a, __m256i __b)
1018{
1019 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1020}
1021
1022/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1023/// with the corresponding signed byte from the 256-bit integer vector in
1024/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1025/// pairs of those products using signed saturation to form 16-bit sums
1026/// returned as elements of the [16 x i16] result.
1027///
1028/// \code{.operation}
1029/// FOR i := 0 TO 15
1030/// j := i*16
1031/// temp1 := __a[j+7:j] * __b[j+7:j]
1032/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1033/// result[j+15:j] := SATURATE16(temp1 + temp2)
1034/// ENDFOR
1035/// \endcode
1036///
1037/// \headerfile <immintrin.h>
1038///
1039/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1040///
1041/// \param __a
1042/// A 256-bit vector containing one of the source operands.
1043/// \param __b
1044/// A 256-bit vector containing one of the source operands.
1045/// \returns A 256-bit vector of [16 x i16] containing the result.
1046static __inline__ __m256i __DEFAULT_FN_ATTRS256
1048{
1049 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1050}
1051
1052/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1053/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1054/// those products to form 32-bit sums returned as elements of the
1055/// [8 x i32] result.
1056///
1057/// There is only one wraparound case: when all four of the 16-bit sources
1058/// are \c 0x8000, the result will be \c 0x80000000.
1059///
1060/// \code{.operation}
1061/// FOR i := 0 TO 7
1062/// j := i*32
1063/// temp1 := __a[j+15:j] * __b[j+15:j]
1064/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1065/// result[j+31:j] := temp1 + temp2
1066/// ENDFOR
1067/// \endcode
1068///
1069/// \headerfile <immintrin.h>
1070///
1071/// This intrinsic corresponds to the \c VPMADDWD instruction.
1072///
1073/// \param __a
1074/// A 256-bit vector of [16 x i16] containing one of the source operands.
1075/// \param __b
1076/// A 256-bit vector of [16 x i16] containing one of the source operands.
1077/// \returns A 256-bit vector of [8 x i32] containing the result.
1078static __inline__ __m256i __DEFAULT_FN_ATTRS256
1079_mm256_madd_epi16(__m256i __a, __m256i __b)
1080{
1081 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1082}
1083
1084/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1085/// in \a __a and \a __b and returns the larger of each pair in the
1086/// corresponding byte of the 256-bit result.
1087///
1088/// \headerfile <immintrin.h>
1089///
1090/// This intrinsic corresponds to the \c VPMAXSB instruction.
1091///
1092/// \param __a
1093/// A 256-bit integer vector.
1094/// \param __b
1095/// A 256-bit integer vector.
1096/// \returns A 256-bit integer vector containing the result.
1097static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1098_mm256_max_epi8(__m256i __a, __m256i __b) {
1099 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1100}
1101
1102/// Compares the corresponding signed 16-bit integers in the two 256-bit
1103/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1104/// each pair in the corresponding element of the 256-bit result.
1105///
1106/// \headerfile <immintrin.h>
1107///
1108/// This intrinsic corresponds to the \c VPMAXSW instruction.
1109///
1110/// \param __a
1111/// A 256-bit vector of [16 x i16].
1112/// \param __b
1113/// A 256-bit vector of [16 x i16].
1114/// \returns A 256-bit vector of [16 x i16] containing the result.
1115static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1116_mm256_max_epi16(__m256i __a, __m256i __b) {
1117 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1118}
1119
1120/// Compares the corresponding signed 32-bit integers in the two 256-bit
1121/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1122/// each pair in the corresponding element of the 256-bit result.
1123///
1124/// \headerfile <immintrin.h>
1125///
1126/// This intrinsic corresponds to the \c VPMAXSD instruction.
1127///
1128/// \param __a
1129/// A 256-bit vector of [8 x i32].
1130/// \param __b
1131/// A 256-bit vector of [8 x i32].
1132/// \returns A 256-bit vector of [8 x i32] containing the result.
1133static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1134_mm256_max_epi32(__m256i __a, __m256i __b) {
1135 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1136}
1137
1138/// Compares the corresponding unsigned bytes in the two 256-bit integer
1139/// vectors in \a __a and \a __b and returns the larger of each pair in
1140/// the corresponding byte of the 256-bit result.
1141///
1142/// \headerfile <immintrin.h>
1143///
1144/// This intrinsic corresponds to the \c VPMAXUB instruction.
1145///
1146/// \param __a
1147/// A 256-bit integer vector.
1148/// \param __b
1149/// A 256-bit integer vector.
1150/// \returns A 256-bit integer vector containing the result.
1151static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1152_mm256_max_epu8(__m256i __a, __m256i __b) {
1153 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1154}
1155
1156/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1157/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1158/// each pair in the corresponding element of the 256-bit result.
1159///
1160/// \headerfile <immintrin.h>
1161///
1162/// This intrinsic corresponds to the \c VPMAXUW instruction.
1163///
1164/// \param __a
1165/// A 256-bit vector of [16 x i16].
1166/// \param __b
1167/// A 256-bit vector of [16 x i16].
1168/// \returns A 256-bit vector of [16 x i16] containing the result.
1169static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1170_mm256_max_epu16(__m256i __a, __m256i __b) {
1171 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1172}
1173
1174/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1175/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1176/// each pair in the corresponding element of the 256-bit result.
1177///
1178/// \headerfile <immintrin.h>
1179///
1180/// This intrinsic corresponds to the \c VPMAXUD instruction.
1181///
1182/// \param __a
1183/// A 256-bit vector of [8 x i32].
1184/// \param __b
1185/// A 256-bit vector of [8 x i32].
1186/// \returns A 256-bit vector of [8 x i32] containing the result.
1187static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1188_mm256_max_epu32(__m256i __a, __m256i __b) {
1189 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1190}
1191
1192/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1193/// in \a __a and \a __b and returns the smaller of each pair in the
1194/// corresponding byte of the 256-bit result.
1195///
1196/// \headerfile <immintrin.h>
1197///
1198/// This intrinsic corresponds to the \c VPMINSB instruction.
1199///
1200/// \param __a
1201/// A 256-bit integer vector.
1202/// \param __b
1203/// A 256-bit integer vector.
1204/// \returns A 256-bit integer vector containing the result.
1205static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1206_mm256_min_epi8(__m256i __a, __m256i __b) {
1207 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1208}
1209
1210/// Compares the corresponding signed 16-bit integers in the two 256-bit
1211/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1212/// each pair in the corresponding element of the 256-bit result.
1213///
1214/// \headerfile <immintrin.h>
1215///
1216/// This intrinsic corresponds to the \c VPMINSW instruction.
1217///
1218/// \param __a
1219/// A 256-bit vector of [16 x i16].
1220/// \param __b
1221/// A 256-bit vector of [16 x i16].
1222/// \returns A 256-bit vector of [16 x i16] containing the result.
1223static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1224_mm256_min_epi16(__m256i __a, __m256i __b) {
1225 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1226}
1227
1228/// Compares the corresponding signed 32-bit integers in the two 256-bit
1229/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1230/// each pair in the corresponding element of the 256-bit result.
1231///
1232/// \headerfile <immintrin.h>
1233///
1234/// This intrinsic corresponds to the \c VPMINSD instruction.
1235///
1236/// \param __a
1237/// A 256-bit vector of [8 x i32].
1238/// \param __b
1239/// A 256-bit vector of [8 x i32].
1240/// \returns A 256-bit vector of [8 x i32] containing the result.
1241static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1242_mm256_min_epi32(__m256i __a, __m256i __b) {
1243 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1244}
1245
1246/// Compares the corresponding unsigned bytes in the two 256-bit integer
1247/// vectors in \a __a and \a __b and returns the smaller of each pair in
1248/// the corresponding byte of the 256-bit result.
1249///
1250/// \headerfile <immintrin.h>
1251///
1252/// This intrinsic corresponds to the \c VPMINUB instruction.
1253///
1254/// \param __a
1255/// A 256-bit integer vector.
1256/// \param __b
1257/// A 256-bit integer vector.
1258/// \returns A 256-bit integer vector containing the result.
1259static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1260_mm256_min_epu8(__m256i __a, __m256i __b) {
1261 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1262}
1263
1264/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1265/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1266/// each pair in the corresponding element of the 256-bit result.
1267///
1268/// \headerfile <immintrin.h>
1269///
1270/// This intrinsic corresponds to the \c VPMINUW instruction.
1271///
1272/// \param __a
1273/// A 256-bit vector of [16 x i16].
1274/// \param __b
1275/// A 256-bit vector of [16 x i16].
1276/// \returns A 256-bit vector of [16 x i16] containing the result.
1277static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1278_mm256_min_epu16(__m256i __a, __m256i __b) {
1279 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1280}
1281
1282/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1283/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1284/// each pair in the corresponding element of the 256-bit result.
1285///
1286/// \headerfile <immintrin.h>
1287///
1288/// This intrinsic corresponds to the \c VPMINUD instruction.
1289///
1290/// \param __a
1291/// A 256-bit vector of [8 x i32].
1292/// \param __b
1293/// A 256-bit vector of [8 x i32].
1294/// \returns A 256-bit vector of [8 x i32] containing the result.
1295static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1296_mm256_min_epu32(__m256i __a, __m256i __b) {
1297 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1298}
1299
1300/// Creates a 32-bit integer mask from the most significant bit of each byte
1301/// in the 256-bit integer vector in \a __a and returns the result.
1302///
1303/// \code{.operation}
1304/// FOR i := 0 TO 31
1305/// j := i*8
1306/// result[i] := __a[j+7]
1307/// ENDFOR
1308/// \endcode
1309///
1310/// \headerfile <immintrin.h>
1311///
1312/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1313///
1314/// \param __a
1315/// A 256-bit integer vector containing the source bytes.
1316/// \returns The 32-bit integer mask.
1317static __inline__ int __DEFAULT_FN_ATTRS256
1319{
1320 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1321}
1322
1323/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1324/// the 16-bit values in the corresponding elements of a 256-bit vector
1325/// of [16 x i16].
1326///
1327/// \code{.operation}
1328/// FOR i := 0 TO 15
1329/// j := i*8
1330/// k := i*16
1331/// result[k+15:k] := SignExtend(__V[j+7:j])
1332/// ENDFOR
1333/// \endcode
1334///
1335/// \headerfile <immintrin.h>
1336///
1337/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1338///
1339/// \param __V
1340/// A 128-bit integer vector containing the source bytes.
1341/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1342/// values.
1343static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1345 /* This function always performs a signed extension, but __v16qi is a char
1346 which may be signed or unsigned, so use __v16qs. */
1347 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1348}
1349
1350/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1351/// \a __V and returns the 32-bit values in the corresponding elements of a
1352/// 256-bit vector of [8 x i32].
1353///
1354/// \code{.operation}
1355/// FOR i := 0 TO 7
1356/// j := i*8
1357/// k := i*32
1358/// result[k+31:k] := SignExtend(__V[j+7:j])
1359/// ENDFOR
1360/// \endcode
1361///
1362/// \headerfile <immintrin.h>
1363///
1364/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1365///
1366/// \param __V
1367/// A 128-bit integer vector containing the source bytes.
1368/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1369/// values.
1370static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1372 /* This function always performs a signed extension, but __v16qi is a char
1373 which may be signed or unsigned, so use __v16qs. */
1374 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1375}
1376
1377/// Sign-extends the first four bytes from the 128-bit integer vector in
1378/// \a __V and returns the 64-bit values in the corresponding elements of a
1379/// 256-bit vector of [4 x i64].
1380///
1381/// \code{.operation}
1382/// result[63:0] := SignExtend(__V[7:0])
1383/// result[127:64] := SignExtend(__V[15:8])
1384/// result[191:128] := SignExtend(__V[23:16])
1385/// result[255:192] := SignExtend(__V[31:24])
1386/// \endcode
1387///
1388/// \headerfile <immintrin.h>
1389///
1390/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1391///
1392/// \param __V
1393/// A 128-bit integer vector containing the source bytes.
1394/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1395/// values.
1396static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1398 /* This function always performs a signed extension, but __v16qi is a char
1399 which may be signed or unsigned, so use __v16qs. */
1400 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1401}
1402
1403/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1404/// \a __V and returns the 32-bit values in the corresponding elements of a
1405/// 256-bit vector of [8 x i32].
1406///
1407/// \code{.operation}
1408/// FOR i := 0 TO 7
1409/// j := i*16
1410/// k := i*32
1411/// result[k+31:k] := SignExtend(__V[j+15:j])
1412/// ENDFOR
1413/// \endcode
1414///
1415/// \headerfile <immintrin.h>
1416///
1417/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1418///
1419/// \param __V
1420/// A 128-bit vector of [8 x i16] containing the source values.
1421/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1422/// values.
1423static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1425 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1426}
1427
1428/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1429/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1430/// elements of a 256-bit vector of [4 x i64].
1431///
1432/// \code{.operation}
1433/// result[63:0] := SignExtend(__V[15:0])
1434/// result[127:64] := SignExtend(__V[31:16])
1435/// result[191:128] := SignExtend(__V[47:32])
1436/// result[255:192] := SignExtend(__V[64:48])
1437/// \endcode
1438///
1439/// \headerfile <immintrin.h>
1440///
1441/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1442///
1443/// \param __V
1444/// A 128-bit vector of [8 x i16] containing the source values.
1445/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1446/// values.
1447static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1449 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1450}
1451
1452/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1453/// \a __V and returns the 64-bit values in the corresponding elements of a
1454/// 256-bit vector of [4 x i64].
1455///
1456/// \code{.operation}
1457/// result[63:0] := SignExtend(__V[31:0])
1458/// result[127:64] := SignExtend(__V[63:32])
1459/// result[191:128] := SignExtend(__V[95:64])
1460/// result[255:192] := SignExtend(__V[127:96])
1461/// \endcode
1462///
1463/// \headerfile <immintrin.h>
1464///
1465/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1466///
1467/// \param __V
1468/// A 128-bit vector of [4 x i32] containing the source values.
1469/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1470/// values.
1471static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1473 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1474}
1475
1476/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1477/// the 16-bit values in the corresponding elements of a 256-bit vector
1478/// of [16 x i16].
1479///
1480/// \code{.operation}
1481/// FOR i := 0 TO 15
1482/// j := i*8
1483/// k := i*16
1484/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1485/// ENDFOR
1486/// \endcode
1487///
1488/// \headerfile <immintrin.h>
1489///
1490/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1491///
1492/// \param __V
1493/// A 128-bit integer vector containing the source bytes.
1494/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1495/// values.
1496static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1498 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1499}
1500
1501/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1502/// \a __V and returns the 32-bit values in the corresponding elements of a
1503/// 256-bit vector of [8 x i32].
1504///
1505/// \code{.operation}
1506/// FOR i := 0 TO 7
1507/// j := i*8
1508/// k := i*32
1509/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1510/// ENDFOR
1511/// \endcode
1512///
1513/// \headerfile <immintrin.h>
1514///
1515/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1516///
1517/// \param __V
1518/// A 128-bit integer vector containing the source bytes.
1519/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1520/// values.
1521static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1523 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1524}
1525
1526/// Zero-extends the first four bytes from the 128-bit integer vector in
1527/// \a __V and returns the 64-bit values in the corresponding elements of a
1528/// 256-bit vector of [4 x i64].
1529///
1530/// \code{.operation}
1531/// result[63:0] := ZeroExtend(__V[7:0])
1532/// result[127:64] := ZeroExtend(__V[15:8])
1533/// result[191:128] := ZeroExtend(__V[23:16])
1534/// result[255:192] := ZeroExtend(__V[31:24])
1535/// \endcode
1536///
1537/// \headerfile <immintrin.h>
1538///
1539/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1540///
1541/// \param __V
1542/// A 128-bit integer vector containing the source bytes.
1543/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1544/// values.
1545static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1547 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1548}
1549
1550/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1551/// \a __V and returns the 32-bit values in the corresponding elements of a
1552/// 256-bit vector of [8 x i32].
1553///
1554/// \code{.operation}
1555/// FOR i := 0 TO 7
1556/// j := i*16
1557/// k := i*32
1558/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1559/// ENDFOR
1560/// \endcode
1561///
1562/// \headerfile <immintrin.h>
1563///
1564/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1565///
1566/// \param __V
1567/// A 128-bit vector of [8 x i16] containing the source values.
1568/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1569/// values.
1570static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1572 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1573}
1574
1575/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1576/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1577/// elements of a 256-bit vector of [4 x i64].
1578///
1579/// \code{.operation}
1580/// result[63:0] := ZeroExtend(__V[15:0])
1581/// result[127:64] := ZeroExtend(__V[31:16])
1582/// result[191:128] := ZeroExtend(__V[47:32])
1583/// result[255:192] := ZeroExtend(__V[64:48])
1584/// \endcode
1585///
1586/// \headerfile <immintrin.h>
1587///
1588/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1589///
1590/// \param __V
1591/// A 128-bit vector of [8 x i16] containing the source values.
1592/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1593/// values.
1594static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1596 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1597}
1598
1599/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1600/// \a __V and returns the 64-bit values in the corresponding elements of a
1601/// 256-bit vector of [4 x i64].
1602///
1603/// \code{.operation}
1604/// result[63:0] := ZeroExtend(__V[31:0])
1605/// result[127:64] := ZeroExtend(__V[63:32])
1606/// result[191:128] := ZeroExtend(__V[95:64])
1607/// result[255:192] := ZeroExtend(__V[127:96])
1608/// \endcode
1609///
1610/// \headerfile <immintrin.h>
1611///
1612/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1613///
1614/// \param __V
1615/// A 128-bit vector of [4 x i32] containing the source values.
1616/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1617/// values.
1618static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1620 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1621}
1622
1623/// Multiplies signed 32-bit integers from even-numbered elements of two
1624/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1625/// [4 x i64] result.
1626///
1627/// \code{.operation}
1628/// result[63:0] := __a[31:0] * __b[31:0]
1629/// result[127:64] := __a[95:64] * __b[95:64]
1630/// result[191:128] := __a[159:128] * __b[159:128]
1631/// result[255:192] := __a[223:192] * __b[223:192]
1632/// \endcode
1633///
1634/// \headerfile <immintrin.h>
1635///
1636/// This intrinsic corresponds to the \c VPMULDQ instruction.
1637///
1638/// \param __a
1639/// A 256-bit vector of [8 x i32] containing one of the source operands.
1640/// \param __b
1641/// A 256-bit vector of [8 x i32] containing one of the source operands.
1642/// \returns A 256-bit vector of [4 x i64] containing the products.
1643static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1644_mm256_mul_epi32(__m256i __a, __m256i __b) {
1645 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1646}
1647
1648/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1649/// [16 x i16], truncates the 32-bit results to the most significant 18
1650/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1651/// product in the [16 x i16] result.
1652///
1653/// \code{.operation}
1654/// FOR i := 0 TO 15
1655/// j := i*16
1656/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1657/// result[j+15:j] := temp[16:1]
1658/// \endcode
1659///
1660/// \headerfile <immintrin.h>
1661///
1662/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1663///
1664/// \param __a
1665/// A 256-bit vector of [16 x i16] containing one of the source operands.
1666/// \param __b
1667/// A 256-bit vector of [16 x i16] containing one of the source operands.
1668/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1669static __inline__ __m256i __DEFAULT_FN_ATTRS256
1670_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1671{
1672 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1673}
1674
1675/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1676/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1677/// [16 x i16] result.
1678///
1679/// \headerfile <immintrin.h>
1680///
1681/// This intrinsic corresponds to the \c VPMULHUW instruction.
1682///
1683/// \param __a
1684/// A 256-bit vector of [16 x i16] containing one of the source operands.
1685/// \param __b
1686/// A 256-bit vector of [16 x i16] containing one of the source operands.
1687/// \returns A 256-bit vector of [16 x i16] containing the products.
1688static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1689_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1690{
  /* Uses the __v16hu view (presumably the unsigned [16 x i16] typedef) to
     match the unsigned multiply documented for this intrinsic. */
1691 return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
1692}
1693
1694/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1695/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1696/// [16 x i16] result.
1697///
1698/// \headerfile <immintrin.h>
1699///
1700/// This intrinsic corresponds to the \c VPMULHW instruction.
1701///
1702/// \param __a
1703/// A 256-bit vector of [16 x i16] containing one of the source operands.
1704/// \param __b
1705/// A 256-bit vector of [16 x i16] containing one of the source operands.
1706/// \returns A 256-bit vector of [16 x i16] containing the products.
1707static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1708_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1709{
  /* Same shape as _mm256_mulhi_epu16, but with the __v16hi view and the
     pmulhw256 builtin instead of pmulhuw256. */
1710 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1711}
1712
1713/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1714/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1715/// [16 x i16] result.
1716///
1717/// \headerfile <immintrin.h>
1718///
1719/// This intrinsic corresponds to the \c VPMULLW instruction.
1720///
1721/// \param __a
1722/// A 256-bit vector of [16 x i16] containing one of the source operands.
1723/// \param __b
1724/// A 256-bit vector of [16 x i16] containing one of the source operands.
1725/// \returns A 256-bit vector of [16 x i16] containing the products.
1726static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1727_mm256_mullo_epi16(__m256i __a, __m256i __b)
1728{
  /* Written with the element-wise vector '*' operator rather than a builtin.
     The low 16 bits of a product do not depend on signedness; the __v16hu
     view is presumably chosen so the multiply wraps instead of hitting
     signed-overflow UB (TODO confirm). */
1729 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1730}
1731
1732/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1733/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1734/// [8 x i32] result.
1735///
1736/// \headerfile <immintrin.h>
1737///
1738/// This intrinsic corresponds to the \c VPMULLD instruction.
1739///
1740/// \param __a
1741/// A 256-bit vector of [8 x i32] containing one of the source operands.
1742/// \param __b
1743/// A 256-bit vector of [8 x i32] containing one of the source operands.
1744/// \returns A 256-bit vector of [8 x i32] containing the products.
1745static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1746_mm256_mullo_epi32(__m256i __a, __m256i __b) {
  /* Element-wise vector '*' on the __v8su view. The low 32 bits of each
     product are the same for signed and unsigned inputs; the unsigned view
     presumably avoids signed-overflow UB (TODO confirm). */
1747 return (__m256i)((__v8su)__a * (__v8su)__b);
1748}
1749
1750/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1751/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1752/// [4 x i64] result.
1753///
1754/// \code{.operation}
1755/// result[63:0] := __a[31:0] * __b[31:0]
1756/// result[127:64] := __a[95:64] * __b[95:64]
1757/// result[191:128] := __a[159:128] * __b[159:128]
1758/// result[255:192] := __a[223:192] * __b[223:192]
1759/// \endcode
1760///
1761/// \headerfile <immintrin.h>
1762///
1763/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1764///
1765/// \param __a
1766/// A 256-bit vector of [8 x i32] containing one of the source operands.
1767/// \param __b
1768/// A 256-bit vector of [8 x i32] containing one of the source operands.
1769/// \returns A 256-bit vector of [4 x i64] containing the products.
1770static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1771_mm256_mul_epu32(__m256i __a, __m256i __b) {
  /* NOTE(review): unlike _mm256_mul_epi32, the builtin's result is returned
     without an explicit (__m256i) cast, and the operands use the __v8si view
     even though the intrinsic is documented as unsigned. Presumably this is
     what the pmuludq256 builtin's signature expects; confirm before changing. */
1772 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1773}
1774
1775/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1776/// \a __b.
1777///
1778/// \headerfile <immintrin.h>
1779///
1780/// This intrinsic corresponds to the \c VPOR instruction.
1781///
1782/// \param __a
1783/// A 256-bit integer vector.
1784/// \param __b
1785/// A 256-bit integer vector.
1786/// \returns A 256-bit integer vector containing the result.
1787static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1788_mm256_or_si256(__m256i __a, __m256i __b)
1789{
  /* Plain vector '|' on the __v4du view; since the operation is bitwise, the
     element width of the view does not affect the result. */
1790 return (__m256i)((__v4du)__a | (__v4du)__b);
1791}
1792
1793/// Computes four sum of absolute difference (SAD) operations on sets of eight
1794/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1795/// \a __b.
1796///
1797/// One SAD result is computed for each set of eight bytes from \a __a and
1798/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1799/// corresponding 64-bit element of the result.
1800///
1801/// A single SAD operation takes the differences between the corresponding
1802/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1803/// and sums these eight values to form one 16-bit result. This operation
1804/// is repeated four times with successive sets of eight bytes.
1805///
1806/// \code{.operation}
1807/// FOR i := 0 TO 3
1808/// j := i*64
1809/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1810/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1811/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1812/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1813/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1814/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1815/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1816/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1817/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1818/// temp4 + temp5 + temp6 + temp7
1819/// result[j+63:j+16] := 0
1820/// ENDFOR
1821/// \endcode
1822///
1823/// \headerfile <immintrin.h>
1824///
1825/// This intrinsic corresponds to the \c VPSADBW instruction.
1826///
1827/// \param __a
1828/// A 256-bit integer vector.
1829/// \param __b
1830/// A 256-bit integer vector.
1831/// \returns A 256-bit integer vector containing the result.
1832static __inline__ __m256i __DEFAULT_FN_ATTRS256
1833_mm256_sad_epu8(__m256i __a, __m256i __b)
1834{
  /* Forwards to the psadbw256 builtin with both operands viewed as __v32qi.
     NOTE(review): the result is returned without an explicit (__m256i) cast,
     unlike most neighbouring wrappers. */
1835 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1836}
1837
1838/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1839/// to control information in the 256-bit integer vector \a __b, and
1840/// returns the 256-bit result. In effect there are two separate 128-bit
1841/// shuffles in the lower and upper halves.
1842///
1843/// \code{.operation}
1844/// FOR i := 0 TO 31
1845/// j := i*8
1846/// IF __b[j+7] == 1
1847/// result[j+7:j] := 0
1848/// ELSE
1849/// k := __b[j+3:j] * 8
1850/// IF i > 15
1851/// k := k + 128
1852/// FI
1853/// result[j+7:j] := __a[k+7:k]
1854/// FI
1855/// ENDFOR
1856/// \endcode
1857///
1858/// \headerfile <immintrin.h>
1859///
1860/// This intrinsic corresponds to the \c VPSHUFB instruction.
1861///
1862/// \param __a
1863/// A 256-bit integer vector containing source values.
1864/// \param __b
1865/// A 256-bit integer vector containing control information to determine
1866/// what goes into the corresponding byte of the result. If bit 7 of the
1867/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1868/// control byte specify the index (within the same 128-bit half) of \a __a
1869/// to copy to the result byte.
1870/// \returns A 256-bit integer vector containing the result.
1871static __inline__ __m256i __DEFAULT_FN_ATTRS256
1872_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1873{
  /* __a supplies the data bytes and __b the per-byte control; both are passed
     as __v32qi and the selection itself is done by the pshufb256 builtin. */
1874 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1875}
1876
1877/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1878/// according to control information in the integer literal \a imm, and
1879/// returns the 256-bit result. In effect there are two parallel 128-bit
1880/// shuffles in the lower and upper halves.
1881///
1882/// \code{.operation}
1883/// FOR i := 0 TO 3
1884/// j := i*32
1885/// k := (imm >> i*2)[1:0] * 32
1886/// result[j+31:j] := a[k+31:k]
1887/// result[128+j+31:128+j] := a[128+k+31:128+k]
1888/// ENDFOR
1889/// \endcode
1890///
1891/// \headerfile <immintrin.h>
1892///
1893/// \code
1894/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1895/// \endcode
1896///
1897/// This intrinsic corresponds to the \c VPSHUFD instruction.
1898///
1899/// \param a
1900/// A 256-bit vector of [8 x i32] containing source values.
1901/// \param imm
1902/// An immediate 8-bit value specifying which elements to copy from \a a.
1903/// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1904/// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1905/// forth.
1906/// \returns A 256-bit vector of [8 x i32] containing the result.
/* Macro form: imm is documented as an integer literal, so this is presumably
   kept a macro to hand the pshufd256 builtin a compile-time constant. Each
   argument is parenthesized and expanded exactly once. */
1907#define _mm256_shuffle_epi32(a, imm) \
1908 ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1909
1910/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1911/// according to control information in the integer literal \a imm, and
1912/// returns the 256-bit result. The upper 64 bits of each 128-bit half
1913/// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1914/// copied from \a a unchanged.
1915///
1916/// \code{.operation}
1917/// result[63:0] := a[63:0]
1918/// result[191:128] := a[191:128]
1919/// FOR i := 0 TO 3
1920/// j := i * 16 + 64
1921/// k := (imm >> i*2)[1:0] * 16 + 64
1922/// result[j+15:j] := a[k+15:k]
1923/// result[128+j+15:128+j] := a[128+k+15:128+k]
1924/// ENDFOR
1925/// \endcode
1926///
1927/// \headerfile <immintrin.h>
1928///
1929/// \code
1930/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1931/// \endcode
1932///
1933/// This intrinsic corresponds to the \c VPSHUFHW instruction.
1934///
1935/// \param a
1936/// A 256-bit vector of [16 x i16] containing source values.
1937/// \param imm
1938/// An immediate 8-bit value specifying which elements to copy from \a a.
1939/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1940/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1941/// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1942/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro wrapper over the pshufhw256 builtin: a is first converted to __m256i
   and then viewed as __v16hi; imm is cast to int. Presumably a macro so that
   imm stays a compile-time constant (it is documented as an integer literal). */
1943#define _mm256_shufflehi_epi16(a, imm) \
1944 ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1945
1946/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1947/// according to control information in the integer literal \a imm, and
1948/// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1949/// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1950/// copied from \a a unchanged.
1951///
1952/// \code{.operation}
1953/// result[127:64] := a[127:64]
1954/// result[255:192] := a[255:192]
1955/// FOR i := 0 TO 3
1956/// j := i * 16
1957/// k := (imm >> i*2)[1:0] * 16
1958/// result[j+15:j] := a[k+15:k]
1959/// result[128+j+15:128+j] := a[128+k+15:128+k]
1960/// ENDFOR
1961/// \endcode
1962///
1963/// \headerfile <immintrin.h>
1964///
1965/// \code
1966/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1967/// \endcode
1968///
1969/// This intrinsic corresponds to the \c VPSHUFLW instruction.
1970///
1971/// \param a
1972/// A 256-bit vector of [16 x i16] to use as a source of data for the
1973/// result.
1974/// \param imm
1975/// An immediate 8-bit value specifying which elements to copy from \a a.
1976/// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1977/// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1978/// forth.
1979/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Low-half counterpart of _mm256_shufflehi_epi16: identical argument
   handling, but expands to the pshuflw256 builtin. Presumably a macro so that
   imm stays a compile-time constant (it is documented as an integer literal). */
1980#define _mm256_shufflelo_epi16(a, imm) \
1981 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
1982
1983/// Sets each byte of the result to the corresponding byte of the 256-bit
1984/// integer vector in \a __a, the negative of that byte, or zero, depending
1985/// on whether the corresponding byte of the 256-bit integer vector in
1986/// \a __b is greater than zero, less than zero, or equal to zero,
1987/// respectively.
1988///
1989/// \headerfile <immintrin.h>
1990///
1991/// This intrinsic corresponds to the \c VPSIGNB instruction.
1992///
1993/// \param __a
1994/// A 256-bit integer vector.
1995/// \param __b
1996/// A 256-bit integer vector.
1997/// \returns A 256-bit integer vector containing the result.
1998static __inline__ __m256i __DEFAULT_FN_ATTRS256
1999_mm256_sign_epi8(__m256i __a, __m256i __b)
2000{
  /* Both operands go to the psignb256 builtin as __v32qi; the sign test on
     __b and the conditional negate/zero of __a happen inside the builtin. */
2001 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2002}
2003
2004/// Sets each element of the result to the corresponding element of the
2005/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2006/// or zero, depending on whether the corresponding element of the 256-bit
2007/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2008/// equal to zero, respectively.
2009///
2010/// \headerfile <immintrin.h>
2011///
2012/// This intrinsic corresponds to the \c VPSIGNW instruction.
2013///
2014/// \param __a
2015/// A 256-bit vector of [16 x i16].
2016/// \param __b
2017/// A 256-bit vector of [16 x i16].
2018/// \returns A 256-bit vector of [16 x i16] containing the result.
2019static __inline__ __m256i __DEFAULT_FN_ATTRS256
2020_mm256_sign_epi16(__m256i __a, __m256i __b)
2021{
  /* 16-bit variant of the sign operation: operands are viewed as __v16hi and
     forwarded to the psignw256 builtin, which does all of the work. */
2022 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2023}
2024
2025/// Sets each element of the result to the corresponding element of the
2026/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2027/// zero, depending on whether the corresponding element of the 256-bit
2028/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2029/// equal to zero, respectively.
2030///
2031/// \headerfile <immintrin.h>
2032///
2033/// This intrinsic corresponds to the \c VPSIGND instruction.
2034///
2035/// \param __a
2036/// A 256-bit vector of [8 x i32].
2037/// \param __b
2038/// A 256-bit vector of [8 x i32].
2039/// \returns A 256-bit vector of [8 x i32] containing the result.
2040static __inline__ __m256i __DEFAULT_FN_ATTRS256
2041_mm256_sign_epi32(__m256i __a, __m256i __b)
2042{
  /* 32-bit variant of the sign operation: operands are viewed as __v8si and
     forwarded to the psignd256 builtin, which does all of the work. */
2043 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2044}
2045
2046/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2047/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2048/// is greater than 15, the returned result is all zeroes.
2049///
2050/// \headerfile <immintrin.h>
2051///
2052/// \code
2053/// __m256i _mm256_slli_si256(__m256i a, const int imm);
2054/// \endcode
2055///
2056/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2057///
2058/// \param a
2059/// A 256-bit integer vector to be shifted.
2060/// \param imm
2061/// An unsigned immediate value specifying the shift count (in bytes).
2062/// \returns A 256-bit integer vector containing the result.
/* Expands to the same pslldqi256_byteshift builtin call as
   _mm256_bslli_epi128; a is viewed as __v32qi and imm is cast to int and
   passed through unchanged. */
2063#define _mm256_slli_si256(a, imm) \
2064 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2065 (int)(imm)))
2066
2067/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2068/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2069/// is greater than 15, the returned result is all zeroes.
2070///
2071/// \headerfile <immintrin.h>
2072///
2073/// \code
2074/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2075/// \endcode
2076///
2077/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2078///
2079/// \param a
2080/// A 256-bit integer vector to be shifted.
2081/// \param imm
2082/// An unsigned immediate value specifying the shift count (in bytes).
2083/// \returns A 256-bit integer vector containing the result.
/* Second spelling of the per-128-bit-half left byte shift: the expansion is
   identical to _mm256_slli_si256 (same builtin, same argument casts). */
2084#define _mm256_bslli_epi128(a, imm) \
2085 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2086 (int)(imm)))
2087
2088/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2089/// left by \a __count bits, shifting in zero bits, and returns the result.
2090/// If \a __count is greater than 15, the returned result is all zeroes.
2091///
2092/// \headerfile <immintrin.h>
2093///
2094/// This intrinsic corresponds to the \c VPSLLW instruction.
2095///
2096/// \param __a
2097/// A 256-bit vector of [16 x i16] to be shifted.
2098/// \param __count
2099/// An unsigned integer value specifying the shift count (in bits).
2100/// \returns A 256-bit vector of [16 x i16] containing the result.
2101static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2102_mm256_slli_epi16(__m256i __a, int __count) {
  /* __count is forwarded as-is; the documented all-zero result for counts
     above 15 comes from the psllwi256 builtin, it is not checked here. */
2103 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2104}
2105
2106/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2107/// left by the number of bits specified by the lower 64 bits of \a __count,
2108/// shifting in zero bits, and returns the result. If \a __count is greater
2109/// than 15, the returned result is all zeroes.
2110///
2111/// \headerfile <immintrin.h>
2112///
2113/// This intrinsic corresponds to the \c VPSLLW instruction.
2114///
2115/// \param __a
2116/// A 256-bit vector of [16 x i16] to be shifted.
2117/// \param __count
2118/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2119/// shift count (in bits). The upper element is ignored.
2120/// \returns A 256-bit vector of [16 x i16] containing the result.
2121static __inline__ __m256i __DEFAULT_FN_ATTRS256
2122_mm256_sll_epi16(__m256i __a, __m128i __count)
2123{
  /* The 128-bit count is reinterpreted as __v8hi, presumably only to match
     the psllw256 builtin's parameter type; the shift amount is still taken
     from its low 64 bits as documented for this intrinsic. */
2124 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2125}
2126
2127/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2128/// left by \a __count bits, shifting in zero bits, and returns the result.
2129/// If \a __count is greater than 31, the returned result is all zeroes.
2130///
2131/// \headerfile <immintrin.h>
2132///
2133/// This intrinsic corresponds to the \c VPSLLD instruction.
2134///
2135/// \param __a
2136/// A 256-bit vector of [8 x i32] to be shifted.
2137/// \param __count
2138/// An unsigned integer value specifying the shift count (in bits).
2139/// \returns A 256-bit vector of [8 x i32] containing the result.
2140static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2141_mm256_slli_epi32(__m256i __a, int __count) {
  /* __a is viewed as __v8si and __count is passed through unchanged; counts
     above 31 are handled by the pslldi256 builtin, not by this wrapper. */
2142 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2143}
2144
2145/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2146/// left by the number of bits given in the lower 64 bits of \a __count,
2147/// shifting in zero bits, and returns the result. If \a __count is greater
2148/// than 31, the returned result is all zeroes.
2149///
2150/// \headerfile <immintrin.h>
2151///
2152/// This intrinsic corresponds to the \c VPSLLD instruction.
2153///
2154/// \param __a
2155/// A 256-bit vector of [8 x i32] to be shifted.
2156/// \param __count
2157/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2158/// shift count (in bits). The upper element is ignored.
2159/// \returns A 256-bit vector of [8 x i32] containing the result.
2160static __inline__ __m256i __DEFAULT_FN_ATTRS256
2161_mm256_sll_epi32(__m256i __a, __m128i __count)
2162{
  /* The 128-bit count is reinterpreted as __v4si, presumably only to match
     the pslld256 builtin's parameter type; the shift amount is still taken
     from its low 64 bits as documented for this intrinsic. */
2163 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2164}
2165
2166/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2167/// left by \a __count bits, shifting in zero bits, and returns the result.
2168/// If \a __count is greater than 63, the returned result is all zeroes.
2169///
2170/// \headerfile <immintrin.h>
2171///
2172/// This intrinsic corresponds to the \c VPSLLQ instruction.
2173///
2174/// \param __a
2175/// A 256-bit vector of [4 x i64] to be shifted.
2176/// \param __count
2177/// An unsigned integer value specifying the shift count (in bits).
2178/// \returns A 256-bit vector of [4 x i64] containing the result.
2179static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2180_mm256_slli_epi64(__m256i __a, int __count) {
  /* __a is viewed as __v4di and __count is passed through unchanged.
     NOTE(review): no explicit (__m256i) cast on the result, unlike the 16-
     and 32-bit variants; presumably the builtin's return type converts
     implicitly. */
2181 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2182}
2183
2184/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2185/// left by the number of bits given in the lower 64 bits of \a __count,
2186/// shifting in zero bits, and returns the result. If \a __count is greater
2187/// than 63, the returned result is all zeroes.
2188///
2189/// \headerfile <immintrin.h>
2190///
2191/// This intrinsic corresponds to the \c VPSLLQ instruction.
2192///
2193/// \param __a
2194/// A 256-bit vector of [4 x i64] to be shifted.
2195/// \param __count
2196/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2197/// shift count (in bits). The upper element is ignored.
2198/// \returns A 256-bit vector of [4 x i64] containing the result.
2199static __inline__ __m256i __DEFAULT_FN_ATTRS256
2200_mm256_sll_epi64(__m256i __a, __m128i __count)
2201{
  /* Here __count is handed to the psllq256 builtin without any cast, and the
     result is returned without an explicit (__m256i) cast; presumably both
     already have the types the builtin uses (TODO confirm). */
2202 return __builtin_ia32_psllq256((__v4di)__a, __count);
2203}
2204
2205/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2206/// right by \a __count bits, shifting in sign bits, and returns the result.
2207/// If \a __count is greater than 15, each element of the result is either
2208/// 0 or -1 according to the corresponding input sign bit.
2209///
2210/// \headerfile <immintrin.h>
2211///
2212/// This intrinsic corresponds to the \c VPSRAW instruction.
2213///
2214/// \param __a
2215/// A 256-bit vector of [16 x i16] to be shifted.
2216/// \param __count
2217/// An unsigned integer value specifying the shift count (in bits).
2218/// \returns A 256-bit vector of [16 x i16] containing the result.
2219static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2220_mm256_srai_epi16(__m256i __a, int __count) {
  /* Arithmetic right shift is delegated to the psrawi256 builtin; the
     sign fill and the count > 15 case are handled there, not in this wrapper. */
2221 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2222}
2223
2224/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2225/// right by the number of bits given in the lower 64 bits of \a __count,
2226/// shifting in sign bits, and returns the result. If \a __count is greater
2227/// than 15, each element of the result is either 0 or -1 according to the
2228/// corresponding input sign bit.
2229///
2230/// \headerfile <immintrin.h>
2231///
2232/// This intrinsic corresponds to the \c VPSRAW instruction.
2233///
2234/// \param __a
2235/// A 256-bit vector of [16 x i16] to be shifted.
2236/// \param __count
2237/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2238/// shift count (in bits). The upper element is ignored.
2239/// \returns A 256-bit vector of [16 x i16] containing the result.
2240static __inline__ __m256i __DEFAULT_FN_ATTRS256
2241_mm256_sra_epi16(__m256i __a, __m128i __count)
2242{
  /* Vector-count form of the 16-bit arithmetic right shift. __count is
     reinterpreted as __v8hi, presumably only to match the psraw256 builtin's
     parameter type. */
2243 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2244}
2245
2246/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2247/// right by \a __count bits, shifting in sign bits, and returns the result.
2248/// If \a __count is greater than 31, each element of the result is either
2249/// 0 or -1 according to the corresponding input sign bit.
2250///
2251/// \headerfile <immintrin.h>
2252///
2253/// This intrinsic corresponds to the \c VPSRAD instruction.
2254///
2255/// \param __a
2256/// A 256-bit vector of [8 x i32] to be shifted.
2257/// \param __count
2258/// An unsigned integer value specifying the shift count (in bits).
2259/// \returns A 256-bit vector of [8 x i32] containing the result.
2260static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2261_mm256_srai_epi32(__m256i __a, int __count) {
  /* Arithmetic right shift is delegated to the psradi256 builtin; the
     sign fill and the count > 31 case are handled there, not in this wrapper. */
2262 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2263}
2264
2265/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2266/// right by the number of bits given in the lower 64 bits of \a __count,
2267/// shifting in sign bits, and returns the result. If \a __count is greater
2268/// than 31, each element of the result is either 0 or -1 according to the
2269/// corresponding input sign bit.
2270///
2271/// \headerfile <immintrin.h>
2272///
2273/// This intrinsic corresponds to the \c VPSRAD instruction.
2274///
2275/// \param __a
2276/// A 256-bit vector of [8 x i32] to be shifted.
2277/// \param __count
2278/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2279/// shift count (in bits). The upper element is ignored.
2280/// \returns A 256-bit vector of [8 x i32] containing the result.
2281static __inline__ __m256i __DEFAULT_FN_ATTRS256
2282_mm256_sra_epi32(__m256i __a, __m128i __count)
2283{
  /* Vector-count form of the 32-bit arithmetic right shift. __count is
     reinterpreted as __v4si, presumably only to match the psrad256 builtin's
     parameter type. */
2284 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2285}
2286
2287/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2288/// \a imm bytes, shifting in zero bytes, and returns the result. If
2289/// \a imm is greater than 15, the returned result is all zeroes.
2290///
2291/// \headerfile <immintrin.h>
2292///
2293/// \code
2294/// __m256i _mm256_srli_si256(__m256i a, const int imm);
2295/// \endcode
2296///
2297/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2298///
2299/// \param a
2300/// A 256-bit integer vector to be shifted.
2301/// \param imm
2302/// An unsigned immediate value specifying the shift count (in bytes).
2303/// \returns A 256-bit integer vector containing the result.
/* Expands to the same psrldqi256_byteshift builtin call as
   _mm256_bsrli_epi128; a is viewed as __v32qi and imm is cast to int and
   passed through unchanged. */
2304#define _mm256_srli_si256(a, imm) \
2305 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2306 (int)(imm)))
2307
2308/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2309/// \a imm bytes, shifting in zero bytes, and returns the result. If
2310/// \a imm is greater than 15, the returned result is all zeroes.
2311///
2312/// \headerfile <immintrin.h>
2313///
2314/// \code
2315/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2316/// \endcode
2317///
2318/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2319///
2320/// \param a
2321/// A 256-bit integer vector to be shifted.
2322/// \param imm
2323/// An unsigned immediate value specifying the shift count (in bytes).
2324/// \returns A 256-bit integer vector containing the result.
/* Second spelling of the per-128-bit-half right byte shift: the expansion is
   identical to _mm256_srli_si256 (same builtin, same argument casts). */
2325#define _mm256_bsrli_epi128(a, imm) \
2326 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2327 (int)(imm)))
2328
2329/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2330/// right by \a __count bits, shifting in zero bits, and returns the result.
2331/// If \a __count is greater than 15, the returned result is all zeroes.
2332///
2333/// \headerfile <immintrin.h>
2334///
2335/// This intrinsic corresponds to the \c VPSRLW instruction.
2336///
2337/// \param __a
2338/// A 256-bit vector of [16 x i16] to be shifted.
2339/// \param __count
2340/// An unsigned integer value specifying the shift count (in bits).
2341/// \returns A 256-bit vector of [16 x i16] containing the result.
2342static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2343_mm256_srli_epi16(__m256i __a, int __count) {
  /* Logical right shift is delegated to the psrlwi256 builtin; __count is
     forwarded unchanged and the count > 15 case is not checked here. */
2344 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2345}
2346
2347/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2348/// right by the number of bits given in the lower 64 bits of \a __count,
2349/// shifting in zero bits, and returns the result. If \a __count is greater
2350/// than 15, the returned result is all zeroes.
2351///
2352/// \headerfile <immintrin.h>
2353///
2354/// This intrinsic corresponds to the \c VPSRLW instruction.
2355///
2356/// \param __a
2357/// A 256-bit vector of [16 x i16] to be shifted.
2358/// \param __count
2359/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2360/// shift count (in bits). The upper element is ignored.
2361/// \returns A 256-bit vector of [16 x i16] containing the result.
2362static __inline__ __m256i __DEFAULT_FN_ATTRS256
2363_mm256_srl_epi16(__m256i __a, __m128i __count)
2364{
  /* Vector-count form of the 16-bit logical right shift. __count is
     reinterpreted as __v8hi, presumably only to match the psrlw256 builtin's
     parameter type. */
2365 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2366}
2367
2368/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2369/// right by \a __count bits, shifting in zero bits, and returns the result.
2370/// If \a __count is greater than 31, the returned result is all zeroes.
2371///
2372/// \headerfile <immintrin.h>
2373///
2374/// This intrinsic corresponds to the \c VPSRLD instruction.
2375///
2376/// \param __a
2377/// A 256-bit vector of [8 x i32] to be shifted.
2378/// \param __count
2379/// An unsigned integer value specifying the shift count (in bits).
2380/// \returns A 256-bit vector of [8 x i32] containing the result.
2381static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2382_mm256_srli_epi32(__m256i __a, int __count) {
  /* Logical right shift is delegated to the psrldi256 builtin; __count is
     forwarded unchanged and the count > 31 case is not checked here. */
2383 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2384}
2385
2386/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2387/// right by the number of bits given in the lower 64 bits of \a __count,
2388/// shifting in zero bits, and returns the result. If \a __count is greater
2389/// than 31, the returned result is all zeroes.
2390///
2391/// \headerfile <immintrin.h>
2392///
2393/// This intrinsic corresponds to the \c VPSRLD instruction.
2394///
2395/// \param __a
2396/// A 256-bit vector of [8 x i32] to be shifted.
2397/// \param __count
2398/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2399/// shift count (in bits). The upper element is ignored.
2400/// \returns A 256-bit vector of [8 x i32] containing the result.
2401static __inline__ __m256i __DEFAULT_FN_ATTRS256
2402_mm256_srl_epi32(__m256i __a, __m128i __count)
2403{
  /* Vector-count form of the 32-bit logical right shift. __count is
     reinterpreted as __v4si, presumably only to match the psrld256 builtin's
     parameter type. */
2404 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2405}
2406
2407/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2408/// right by \a __count bits, shifting in zero bits, and returns the result.
2409/// If \a __count is greater than 63, the returned result is all zeroes.
2410///
2411/// \headerfile <immintrin.h>
2412///
2413/// This intrinsic corresponds to the \c VPSRLQ instruction.
2414///
2415/// \param __a
2416/// A 256-bit vector of [4 x i64] to be shifted.
2417/// \param __count
2418/// An unsigned integer value specifying the shift count (in bits).
2419/// \returns A 256-bit vector of [4 x i64] containing the result.
2420static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2421_mm256_srli_epi64(__m256i __a, int __count) {
  /* __a is viewed as __v4di and __count is passed through unchanged.
     NOTE(review): no explicit (__m256i) cast on the result, unlike the 16-
     and 32-bit variants; presumably the builtin's return type converts
     implicitly. */
2422 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2423}
2424
2425/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2426/// right by the number of bits given in the lower 64 bits of \a __count,
2427/// shifting in zero bits, and returns the result. If \a __count is greater
2428/// than 63, the returned result is all zeroes.
2429///
2430/// \headerfile <immintrin.h>
2431///
2432/// This intrinsic corresponds to the \c VPSRLQ instruction.
2433///
2434/// \param __a
2435/// A 256-bit vector of [4 x i64] to be shifted.
2436/// \param __count
2437/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2438/// shift count (in bits). The upper element is ignored.
2439/// \returns A 256-bit vector of [4 x i64] containing the result.
2440static __inline__ __m256i __DEFAULT_FN_ATTRS256
2441_mm256_srl_epi64(__m256i __a, __m128i __count)
2442{
  /* Here __count is handed to the psrlq256 builtin without any cast, and the
     result is returned without an explicit (__m256i) cast; presumably both
     already have the types the builtin uses (TODO confirm). */
2443 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2444}
2445
2446/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2447/// vectors. Returns the lower 8 bits of each difference in the
2448/// corresponding byte of the 256-bit integer vector result (overflow is
2449/// ignored).
2450///
2451/// \code{.operation}
2452/// FOR i := 0 TO 31
2453/// j := i*8
2454/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2455/// ENDFOR
2456/// \endcode
2457///
2458/// \headerfile <immintrin.h>
2459///
2460/// This intrinsic corresponds to the \c VPSUBB instruction.
2461///
2462/// \param __a
2463/// A 256-bit integer vector containing the minuends.
2464/// \param __b
2465/// A 256-bit integer vector containing the subtrahends.
2466/// \returns A 256-bit integer vector containing the differences.
2467static __inline__ __m256i __DEFAULT_FN_ATTRS256
2468_mm256_sub_epi8(__m256i __a, __m256i __b)
2469{
2470 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2471}
2472
2473/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2474/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2475/// the corresponding element of the [16 x i16] result (overflow is
2476/// ignored).
2477///
2478/// \code{.operation}
2479/// FOR i := 0 TO 15
2480/// j := i*16
2481/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2482/// ENDFOR
2483/// \endcode
2484///
2485/// \headerfile <immintrin.h>
2486///
2487/// This intrinsic corresponds to the \c VPSUBW instruction.
2488///
2489/// \param __a
2490/// A 256-bit vector of [16 x i16] containing the minuends.
2491/// \param __b
2492/// A 256-bit vector of [16 x i16] containing the subtrahends.
2493/// \returns A 256-bit vector of [16 x i16] containing the differences.
2494static __inline__ __m256i __DEFAULT_FN_ATTRS256
2495_mm256_sub_epi16(__m256i __a, __m256i __b)
2496{
2497 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2498}
2499
2500/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2501/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2502/// the corresponding element of the [8 x i32] result (overflow is ignored).
2503///
2504/// \code{.operation}
2505/// FOR i := 0 TO 7
2506/// j := i*32
2507/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2508/// ENDFOR
2509/// \endcode
2510///
2511/// \headerfile <immintrin.h>
2512///
2513/// This intrinsic corresponds to the \c VPSUBD instruction.
2514///
2515/// \param __a
2516/// A 256-bit vector of [8 x i32] containing the minuends.
2517/// \param __b
2518/// A 256-bit vector of [8 x i32] containing the subtrahends.
2519/// \returns A 256-bit vector of [8 x i32] containing the differences.
2520static __inline__ __m256i __DEFAULT_FN_ATTRS256
2521_mm256_sub_epi32(__m256i __a, __m256i __b)
2522{
2523 return (__m256i)((__v8su)__a - (__v8su)__b);
2524}
2525
2526/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2527/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2528/// the corresponding element of the [4 x i64] result (overflow is ignored).
2529///
2530/// \code{.operation}
2531/// FOR i := 0 TO 3
2532/// j := i*64
2533/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2534/// ENDFOR
2535/// \endcode
2536///
2537/// \headerfile <immintrin.h>
2538///
2539/// This intrinsic corresponds to the \c VPSUBQ instruction.
2540///
2541/// \param __a
2542/// A 256-bit vector of [4 x i64] containing the minuends.
2543/// \param __b
2544/// A 256-bit vector of [4 x i64] containing the subtrahends.
2545/// \returns A 256-bit vector of [4 x i64] containing the differences.
2546static __inline__ __m256i __DEFAULT_FN_ATTRS256
2547_mm256_sub_epi64(__m256i __a, __m256i __b)
2548{
2549 return (__m256i)((__v4du)__a - (__v4du)__b);
2550}
2551
2552/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2553/// vectors using signed saturation, and returns each differences in the
2554/// corresponding byte of the 256-bit integer vector result.
2555///
2556/// \code{.operation}
2557/// FOR i := 0 TO 31
2558/// j := i*8
2559/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2560/// ENDFOR
2561/// \endcode
2562///
2563/// \headerfile <immintrin.h>
2564///
2565/// This intrinsic corresponds to the \c VPSUBSB instruction.
2566///
2567/// \param __a
2568/// A 256-bit integer vector containing the minuends.
2569/// \param __b
2570/// A 256-bit integer vector containing the subtrahends.
2571/// \returns A 256-bit integer vector containing the differences.
2572static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2573_mm256_subs_epi8(__m256i __a, __m256i __b) {
2574 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2575}
2576
2577/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2578/// vectors of [16 x i16] using signed saturation, and returns each
2579/// difference in the corresponding element of the [16 x i16] result.
2580///
2581/// \code{.operation}
2582/// FOR i := 0 TO 15
2583/// j := i*16
2584/// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
2585/// ENDFOR
2586/// \endcode
2587///
2588/// \headerfile <immintrin.h>
2589///
2590/// This intrinsic corresponds to the \c VPSUBSW instruction.
2591///
2592/// \param __a
2593/// A 256-bit vector of [16 x i16] containing the minuends.
2594/// \param __b
2595/// A 256-bit vector of [16 x i16] containing the subtrahends.
2596/// \returns A 256-bit vector of [16 x i16] containing the differences.
2597static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2598_mm256_subs_epi16(__m256i __a, __m256i __b) {
2599 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2600}
2601
2602/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2603/// vectors using unsigned saturation, and returns each difference in the
2604/// corresponding byte of the 256-bit integer vector result. For each byte,
2605/// computes <c> result = __a - __b </c>.
2606///
2607/// \code{.operation}
2608/// FOR i := 0 TO 31
2609/// j := i*8
2610/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2611/// ENDFOR
2612/// \endcode
2613///
2614/// \headerfile <immintrin.h>
2615///
2616/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2617///
2618/// \param __a
2619/// A 256-bit integer vector containing the minuends.
2620/// \param __b
2621/// A 256-bit integer vector containing the subtrahends.
2622/// \returns A 256-bit integer vector containing the differences.
2623static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2624_mm256_subs_epu8(__m256i __a, __m256i __b) {
2625 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2626}
2627
2628/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2629/// vectors of [16 x i16] using unsigned saturation, and returns each
2630/// difference in the corresponding element of the [16 x i16] result.
2631///
2632/// \code{.operation}
2633/// FOR i := 0 TO 15
2634/// j := i*16
2635/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2636/// ENDFOR
2637/// \endcode
2638///
2639/// \headerfile <immintrin.h>
2640///
2641/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2642///
2643/// \param __a
2644/// A 256-bit vector of [16 x i16] containing the minuends.
2645/// \param __b
2646/// A 256-bit vector of [16 x i16] containing the subtrahends.
2647/// \returns A 256-bit vector of [16 x i16] containing the differences.
2648static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2649_mm256_subs_epu16(__m256i __a, __m256i __b) {
2650 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2651}
2652
2653/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2654/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2655/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2656/// input; other bits in these parameters are ignored.
2657///
2658/// \code{.operation}
2659/// result[7:0] := __a[71:64]
2660/// result[15:8] := __b[71:64]
2661/// result[23:16] := __a[79:72]
2662/// result[31:24] := __b[79:72]
2663/// . . .
2664/// result[127:120] := __b[127:120]
2665/// result[135:128] := __a[199:192]
2666/// . . .
2667/// result[255:248] := __b[255:248]
2668/// \endcode
2669///
2670/// \headerfile <immintrin.h>
2671///
2672/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2673///
2674/// \param __a
2675/// A 256-bit integer vector used as the source for the even-numbered bytes
2676/// of the result.
2677/// \param __b
2678/// A 256-bit integer vector used as the source for the odd-numbered bytes
2679/// of the result.
2680/// \returns A 256-bit integer vector containing the result.
2681static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2682_mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
2683 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2684}
2685
2686/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2687/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2688/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2689/// 128-bit half of \a __a and \a __b as input; other bits in these
2690/// parameters are ignored.
2691///
2692/// \code{.operation}
2693/// result[15:0] := __a[79:64]
2694/// result[31:16] := __b[79:64]
2695/// result[47:32] := __a[95:80]
2696/// result[63:48] := __b[95:80]
2697/// . . .
2698/// result[127:112] := __b[127:112]
2699/// result[143:128] := __a[211:196]
2700/// . . .
2701/// result[255:240] := __b[255:240]
2702/// \endcode
2703///
2704/// \headerfile <immintrin.h>
2705///
2706/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2707///
2708/// \param __a
2709/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2710/// elements of the result.
2711/// \param __b
2712/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2713/// elements of the result.
2714/// \returns A 256-bit vector of [16 x i16] containing the result.
2715static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2716_mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
2717 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2718}
2719
2720/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2721/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2722/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2723/// of \a __a and \a __b as input; other bits in these parameters are
2724/// ignored.
2725///
2726/// \code{.operation}
2727/// result[31:0] := __a[95:64]
2728/// result[63:32] := __b[95:64]
2729/// result[95:64] := __a[127:96]
2730/// result[127:96] := __b[127:96]
2731/// result[159:128] := __a[223:192]
2732/// result[191:160] := __b[223:192]
2733/// result[223:192] := __a[255:224]
2734/// result[255:224] := __b[255:224]
2735/// \endcode
2736///
2737/// \headerfile <immintrin.h>
2738///
2739/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2740///
2741/// \param __a
2742/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2743/// elements of the result.
2744/// \param __b
2745/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2746/// elements of the result.
2747/// \returns A 256-bit vector of [8 x i32] containing the result.
2748static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2749_mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
2750 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2751}
2752
2753/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2754/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2755/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2756/// of \a __a and \a __b as input; other bits in these parameters are
2757/// ignored.
2758///
2759/// \code{.operation}
2760/// result[63:0] := __a[127:64]
2761/// result[127:64] := __b[127:64]
2762/// result[191:128] := __a[255:192]
2763/// result[255:192] := __b[255:192]
2764/// \endcode
2765///
2766/// \headerfile <immintrin.h>
2767///
2768/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2769///
2770/// \param __a
2771/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2772/// elements of the result.
2773/// \param __b
2774/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2775/// elements of the result.
2776/// \returns A 256-bit vector of [4 x i64] containing the result.
2777static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2778_mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
2779 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2780}
2781
2782/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2783/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2784/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2785/// input; other bits in these parameters are ignored.
2786///
2787/// \code{.operation}
2788/// result[7:0] := __a[7:0]
2789/// result[15:8] := __b[7:0]
2790/// result[23:16] := __a[15:8]
2791/// result[31:24] := __b[15:8]
2792/// . . .
2793/// result[127:120] := __b[63:56]
2794/// result[135:128] := __a[135:128]
2795/// . . .
2796/// result[255:248] := __b[191:184]
2797/// \endcode
2798///
2799/// \headerfile <immintrin.h>
2800///
2801/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2802///
2803/// \param __a
2804/// A 256-bit integer vector used as the source for the even-numbered bytes
2805/// of the result.
2806/// \param __b
2807/// A 256-bit integer vector used as the source for the odd-numbered bytes
2808/// of the result.
2809/// \returns A 256-bit integer vector containing the result.
2810static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2811_mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
2812 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2813}
2814
2815/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2816/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2817/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2818/// 128-bit half of \a __a and \a __b as input; other bits in these
2819/// parameters are ignored.
2820///
2821/// \code{.operation}
2822/// result[15:0] := __a[15:0]
2823/// result[31:16] := __b[15:0]
2824/// result[47:32] := __a[31:16]
2825/// result[63:48] := __b[31:16]
2826/// . . .
2827/// result[127:112] := __b[63:48]
2828/// result[143:128] := __a[143:128]
2829/// . . .
2830/// result[255:239] := __b[191:176]
2831/// \endcode
2832///
2833/// \headerfile <immintrin.h>
2834///
2835/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2836///
2837/// \param __a
2838/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2839/// elements of the result.
2840/// \param __b
2841/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2842/// elements of the result.
2843/// \returns A 256-bit vector of [16 x i16] containing the result.
2844static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2845_mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
2846 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2847}
2848
2849/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2850/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2851/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2852/// of \a __a and \a __b as input; other bits in these parameters are
2853/// ignored.
2854///
2855/// \code{.operation}
2856/// result[31:0] := __a[31:0]
2857/// result[63:32] := __b[31:0]
2858/// result[95:64] := __a[63:32]
2859/// result[127:96] := __b[63:32]
2860/// result[159:128] := __a[159:128]
2861/// result[191:160] := __b[159:128]
2862/// result[223:192] := __a[191:160]
2863/// result[255:224] := __b[191:190]
2864/// \endcode
2865///
2866/// \headerfile <immintrin.h>
2867///
2868/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2869///
2870/// \param __a
2871/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2872/// elements of the result.
2873/// \param __b
2874/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2875/// elements of the result.
2876/// \returns A 256-bit vector of [8 x i32] containing the result.
2877static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2878_mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
2879 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2880}
2881
2882/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2883/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2884/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2885/// of \a __a and \a __b as input; other bits in these parameters are
2886/// ignored.
2887///
2888/// \code{.operation}
2889/// result[63:0] := __a[63:0]
2890/// result[127:64] := __b[63:0]
2891/// result[191:128] := __a[191:128]
2892/// result[255:192] := __b[191:128]
2893/// \endcode
2894///
2895/// \headerfile <immintrin.h>
2896///
2897/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2898///
2899/// \param __a
2900/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2901/// elements of the result.
2902/// \param __b
2903/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2904/// elements of the result.
2905/// \returns A 256-bit vector of [4 x i64] containing the result.
2906static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2907_mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
2908 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2909}
2910
2911/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2912/// \a __b.
2913///
2914/// \headerfile <immintrin.h>
2915///
2916/// This intrinsic corresponds to the \c VPXOR instruction.
2917///
2918/// \param __a
2919/// A 256-bit integer vector.
2920/// \param __b
2921/// A 256-bit integer vector.
2922/// \returns A 256-bit integer vector containing the result.
2923static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2924_mm256_xor_si256(__m256i __a, __m256i __b)
2925{
2926 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2927}
2928
2929/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2930/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2931/// boundary.
2932///
2933/// \headerfile <immintrin.h>
2934///
2935/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2936///
2937/// \param __V
2938/// A pointer to the 32-byte aligned memory containing the vector to load.
2939/// \returns A 256-bit integer vector loaded from memory.
2940static __inline__ __m256i __DEFAULT_FN_ATTRS256
2942{
2943 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2944 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2945}
2946
2947/// Broadcasts the 32-bit floating-point value from the low element of the
2948/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2949/// 128-bit vector of [4 x float].
2950///
2951/// \headerfile <immintrin.h>
2952///
2953/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2954///
2955/// \param __X
2956/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2957/// \returns A 128-bit vector of [4 x float] containing the result.
2958static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
2960 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
2961}
2962
2963/// Broadcasts the 64-bit floating-point value from the low element of the
2964/// 128-bit vector of [2 x double] in \a __a to both elements of the
2965/// result's 128-bit vector of [2 x double].
2966///
2967/// \headerfile <immintrin.h>
2968///
2969/// This intrinsic corresponds to the \c MOVDDUP instruction.
2970///
2971/// \param __a
2972/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2973/// \returns A 128-bit vector of [2 x double] containing the result.
2974static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
2976 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2977}
2978
2979/// Broadcasts the 32-bit floating-point value from the low element of the
2980/// 128-bit vector of [4 x float] in \a __X to all elements of the
2981/// result's 256-bit vector of [8 x float].
2982///
2983/// \headerfile <immintrin.h>
2984///
2985/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2986///
2987/// \param __X
2988/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2989/// \returns A 256-bit vector of [8 x float] containing the result.
2990static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
2992 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2993}
2994
2995/// Broadcasts the 64-bit floating-point value from the low element of the
2996/// 128-bit vector of [2 x double] in \a __X to all elements of the
2997/// result's 256-bit vector of [4 x double].
2998///
2999/// \headerfile <immintrin.h>
3000///
3001/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3002///
3003/// \param __X
3004/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3005/// \returns A 256-bit vector of [4 x double] containing the result.
3006static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
3008 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3009}
3010
3011/// Broadcasts the 128-bit integer data from \a __X to both the lower and
3012/// upper halves of the 256-bit result.
3013///
3014/// \headerfile <immintrin.h>
3015///
3016/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3017///
3018/// \param __X
3019/// A 128-bit integer vector to be broadcast.
3020/// \returns A 256-bit integer vector containing the result.
3021static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3023 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3024}
3025
/* Alias under the _mm_ prefix: forwards its argument unchanged to
   _mm256_broadcastsi128_si256. */
#define _mm_broadcastsi128_si256(V) _mm256_broadcastsi128_si256(V)
3027
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                      (__v4si)(__m128i)(V2), (int)(M)))
3064
/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                      (__v8si)(__m256i)(V2), (int)(M)))
3101
3102/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3103/// bytes of the 256-bit result.
3104///
3105/// \headerfile <immintrin.h>
3106///
3107/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3108///
3109/// \param __X
3110/// A 128-bit integer vector whose low byte will be broadcast.
3111/// \returns A 256-bit integer vector containing the result.
3112static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3114 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3115}
3116
3117/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3118/// to all elements of the result's 256-bit vector of [16 x i16].
3119///
3120/// \headerfile <immintrin.h>
3121///
3122/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3123///
3124/// \param __X
3125/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3126/// \returns A 256-bit vector of [16 x i16] containing the result.
3127static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3129 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3130}
3131
3132/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3133/// to all elements of the result's 256-bit vector of [8 x i32].
3134///
3135/// \headerfile <immintrin.h>
3136///
3137/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3138///
3139/// \param __X
3140/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3141/// \returns A 256-bit vector of [8 x i32] containing the result.
3142static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3144 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3145}
3146
3147/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3148/// to all elements of the result's 256-bit vector of [4 x i64].
3149///
3150/// \headerfile <immintrin.h>
3151///
3152/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3153///
3154/// \param __X
3155/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3156/// \returns A 256-bit vector of [4 x i64] containing the result.
3157static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3159 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3160}
3161
3162/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3163/// bytes of the 128-bit result.
3164///
3165/// \headerfile <immintrin.h>
3166///
3167/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3168///
3169/// \param __X
3170/// A 128-bit integer vector whose low byte will be broadcast.
3171/// \returns A 128-bit integer vector containing the result.
3172static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3174 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3175}
3176
3177/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3178/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3179///
3180/// \headerfile <immintrin.h>
3181///
3182/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3183///
3184/// \param __X
3185/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3186/// \returns A 128-bit vector of [8 x i16] containing the result.
3187static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3189 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3190}
3191
3192/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3193/// to all elements of the result's vector of [4 x i32].
3194///
3195/// \headerfile <immintrin.h>
3196///
3197/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3198///
3199/// \param __X
3200/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3201/// \returns A 128-bit vector of [4 x i32] containing the result.
3202static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3204 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3205}
3206
3207/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3208/// to both elements of the result's 128-bit vector of [2 x i64].
3209///
3210/// \headerfile <immintrin.h>
3211///
3212/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3213///
3214/// \param __X
3215/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3216/// \returns A 128-bit vector of [2 x i64] containing the result.
3217static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3219 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3220}
3221
3222/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3223/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3224/// elements of the 256-bit vector of [8 x i32] in \a __b.
3225///
3226/// \code{.operation}
3227/// FOR i := 0 TO 7
3228/// j := i*32
3229/// k := __b[j+2:j] * 32
3230/// result[j+31:j] := __a[k+31:k]
3231/// ENDFOR
3232/// \endcode
3233///
3234/// \headerfile <immintrin.h>
3235///
3236/// This intrinsic corresponds to the \c VPERMD instruction.
3237///
3238/// \param __a
3239/// A 256-bit vector of [8 x i32] containing the source values.
3240/// \param __b
3241/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3242/// \a __a.
3243/// \returns A 256-bit vector of [8 x i32] containing the result.
3244static __inline__ __m256i __DEFAULT_FN_ATTRS256
3246{
3247 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3248}
3249
3250/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3251/// the 256-bit vector of [4 x double] in \a V as specified by the
3252/// immediate value \a M.
3253///
3254/// \code{.operation}
3255/// FOR i := 0 TO 3
3256/// j := i*64
3257/// k := (M >> i*2)[1:0] * 64
3258/// result[j+63:j] := V[k+63:k]
3259/// ENDFOR
3260/// \endcode
3261///
3262/// \headerfile <immintrin.h>
3263///
3264/// \code
3265/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3266/// \endcode
3267///
3268/// This intrinsic corresponds to the \c VPERMPD instruction.
3269///
3270/// \param V
3271/// A 256-bit vector of [4 x double] containing the source values.
3272/// \param M
3273/// An immediate 8-bit value specifying which elements to copy from \a V.
3274/// \a M[1:0] specifies the index in \a a for element 0 of the result,
3275/// \a M[3:2] specifies the index for element 1, and so forth.
3276/// \returns A 256-bit vector of [4 x double] containing the result.
/* Macro form: M is documented as an immediate and is forwarded to the builtin
   after an int cast. */
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256( \
      (__v4df)(__m256d)(V), \
      (int)(M)))
3279
3280/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3281/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3282/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3283///
3284/// \code{.operation}
3285/// FOR i := 0 TO 7
3286/// j := i*32
3287/// k := __b[j+2:j] * 32
3288/// result[j+31:j] := __a[k+31:k]
3289/// ENDFOR
3290/// \endcode
3291///
3292/// \headerfile <immintrin.h>
3293///
3294/// This intrinsic corresponds to the \c VPERMPS instruction.
3295///
3296/// \param __a
3297/// A 256-bit vector of [8 x float] containing the source values.
3298/// \param __b
3299/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3300/// \a __a.
3301/// \returns A 256-bit vector of [8 x float] containing the result.
3302static __inline__ __m256 __DEFAULT_FN_ATTRS256
3304{
3305 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3306}
3307
3308/// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3309/// of the 256-bit vector of [4 x i64] in \a V as specified by the
3310/// immediate value \a M.
3311///
3312/// \code{.operation}
3313/// FOR i := 0 TO 3
3314/// j := i*64
3315/// k := (M >> i*2)[1:0] * 64
3316/// result[j+63:j] := V[k+63:k]
3317/// ENDFOR
3318/// \endcode
3319///
3320/// \headerfile <immintrin.h>
3321///
3322/// \code
3323/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3324/// \endcode
3325///
3326/// This intrinsic corresponds to the \c VPERMQ instruction.
3327///
3328/// \param V
3329/// A 256-bit vector of [4 x i64] containing the source values.
3330/// \param M
3331/// An immediate 8-bit value specifying which elements to copy from \a V.
3332/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3333/// \a M[3:2] specifies the index for element 1, and so forth.
3334/// \returns A 256-bit vector of [4 x i64] containing the result.
/* Macro form: M is documented as an immediate and is forwarded to the builtin
   after an int cast. */
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256( \
      (__v4di)(__m256i)(V), \
      (int)(M)))
3337
3338/// Sets each half of the 256-bit result either to zero or to one of the
3339/// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3340/// as specified by the immediate value \a M.
3341///
3342/// \code{.operation}
3343/// FOR i := 0 TO 1
3344/// j := i*128
3345/// k := M >> (i*4)
3346/// IF k[3] == 0
3347/// CASE (k[1:0]) OF
3348/// 0: result[127+j:j] := V1[127:0]
3349/// 1: result[127+j:j] := V1[255:128]
3350/// 2: result[127+j:j] := V2[127:0]
3351/// 3: result[127+j:j] := V2[255:128]
3352/// ESAC
3353/// ELSE
3354/// result[127+j:j] := 0
3355/// FI
3356/// ENDFOR
3357/// \endcode
3358///
3359/// \headerfile <immintrin.h>
3360///
3361/// \code
3362/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3363/// \endcode
3364///
3365/// This intrinsic corresponds to the \c VPERM2I128 instruction.
3366///
3367/// \param V1
3368/// A 256-bit integer vector containing source values.
3369/// \param V2
3370/// A 256-bit integer vector containing source values.
3371/// \param M
3372/// An immediate value specifying how to form the result. Bits [3:0]
3373/// control the lower half of the result, bits [7:4] control the upper half.
3374/// Within each 4-bit control value, if bit 3 is 1, the result is zero,
3375/// otherwise bits [1:0] determine the source as follows. \n
3376/// 0: the lower half of \a V1 \n
3377/// 1: the upper half of \a V1 \n
3378/// 2: the lower half of \a V2 \n
3379/// 3: the upper half of \a V2
3380/// \returns A 256-bit integer vector containing the result.
/* Macro form: both sources are passed as __m256i and the control value M as
   an int. */
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      (__m256i)(V1), \
      (__m256i)(V2), \
      (int)(M)))
3383
3384/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3385/// of the immediate \a M is zero, extracts the lower half of \a V;
3386/// otherwise, extracts the upper half.
3387///
3388/// \headerfile <immintrin.h>
3389///
3390/// \code
3391/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3392/// \endcode
3393///
3394/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3395///
3396/// \param V
3397/// A 256-bit integer vector containing the source values.
3398/// \param M
3399/// An immediate value specifying which half of \a V to extract.
3400/// \returns A 128-bit integer vector containing the result.
/* Macro form: V is viewed as __v4di and the half selector M is passed as an
   int. */
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      (__v4di)(__m256i)(V), \
      (int)(M)))
3403
3404/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3405/// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3406/// is zero, overwrites the lower half of the result; otherwise,
3407/// overwrites the upper half.
3408///
3409/// \headerfile <immintrin.h>
3410///
3411/// \code
3412/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3413/// \endcode
3414///
3415/// This intrinsic corresponds to the \c VINSERTI128 instruction.
3416///
3417/// \param V1
3418/// A 256-bit integer vector containing a source value.
3419/// \param V2
3420/// A 128-bit integer vector containing a source value.
3421/// \param M
3422/// An immediate value specifying where to put \a V2 in the result.
3423/// \returns A 256-bit integer vector containing the result.
/* Macro form: V1 is viewed as __v4di, V2 as __v2di, and the position selector
   M is passed as an int. */
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), \
      (__v2di)(__m128i)(V2), \
      (int)(M)))
3427
3428/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3429/// the most significant bit of the corresponding element in the mask
3430/// \a __M is set; otherwise, sets that element of the result to zero.
3431/// Returns the 256-bit [8 x i32] result.
3432///
3433/// \code{.operation}
3434/// FOR i := 0 TO 7
3435/// j := i*32
3436/// IF __M[j+31] == 1
3437/// result[j+31:j] := Load32(__X+(i*4))
3438/// ELSE
3439/// result[j+31:j] := 0
3440/// FI
3441/// ENDFOR
3442/// \endcode
3443///
3444/// \headerfile <immintrin.h>
3445///
3446/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3447///
3448/// \param __X
3449/// A pointer to the memory used for loading values.
3450/// \param __M
3451/// A 256-bit vector of [8 x i32] containing the mask bits.
3452/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3453/// elements.
3454static __inline__ __m256i __DEFAULT_FN_ATTRS256
3455_mm256_maskload_epi32(int const *__X, __m256i __M)
3456{
3457 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3458}
3459
3460/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3461/// the most significant bit of the corresponding element in the mask
3462/// \a __M is set; otherwise, sets that element of the result to zero.
3463/// Returns the 256-bit [4 x i64] result.
3464///
3465/// \code{.operation}
3466/// FOR i := 0 TO 3
3467/// j := i*64
3468/// IF __M[j+63] == 1
3469/// result[j+63:j] := Load64(__X+(i*8))
3470/// ELSE
3471/// result[j+63:j] := 0
3472/// FI
3473/// ENDFOR
3474/// \endcode
3475///
3476/// \headerfile <immintrin.h>
3477///
3478/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3479///
3480/// \param __X
3481/// A pointer to the memory used for loading values.
3482/// \param __M
3483/// A 256-bit vector of [4 x i64] containing the mask bits.
3484/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3485/// elements.
3486static __inline__ __m256i __DEFAULT_FN_ATTRS256
3487_mm256_maskload_epi64(long long const *__X, __m256i __M)
3488{
3489 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3490}
3491
3492/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3493/// the most significant bit of the corresponding element in the mask
3494/// \a __M is set; otherwise, sets that element of the result to zero.
3495/// Returns the 128-bit [4 x i32] result.
3496///
3497/// \code{.operation}
3498/// FOR i := 0 TO 3
3499/// j := i*32
3500/// IF __M[j+31] == 1
3501/// result[j+31:j] := Load32(__X+(i*4))
3502/// ELSE
3503/// result[j+31:j] := 0
3504/// FI
3505/// ENDFOR
3506/// \endcode
3507///
3508/// \headerfile <immintrin.h>
3509///
3510/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3511///
3512/// \param __X
3513/// A pointer to the memory used for loading values.
3514/// \param __M
3515/// A 128-bit vector of [4 x i32] containing the mask bits.
3516/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3517/// elements.
3518static __inline__ __m128i __DEFAULT_FN_ATTRS128
3519_mm_maskload_epi32(int const *__X, __m128i __M)
3520{
3521 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3522}
3523
3524/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3525/// the most significant bit of the corresponding element in the mask
3526/// \a __M is set; otherwise, sets that element of the result to zero.
3527/// Returns the 128-bit [2 x i64] result.
3528///
3529/// \code{.operation}
3530/// FOR i := 0 TO 1
3531/// j := i*64
3532/// IF __M[j+63] == 1
3533/// result[j+63:j] := Load64(__X+(i*8))
3534/// ELSE
3535/// result[j+63:j] := 0
3536/// FI
3537/// ENDFOR
3538/// \endcode
3539///
3540/// \headerfile <immintrin.h>
3541///
3542/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3543///
3544/// \param __X
3545/// A pointer to the memory used for loading values.
3546/// \param __M
3547/// A 128-bit vector of [2 x i64] containing the mask bits.
3548/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3549/// elements.
3550static __inline__ __m128i __DEFAULT_FN_ATTRS128
3551_mm_maskload_epi64(long long const *__X, __m128i __M)
3552{
3553 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3554}
3555
3556/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3557/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3558/// the corresponding element in the mask \a __M is set; otherwise, the
3559/// memory element is unchanged.
3560///
3561/// \code{.operation}
3562/// FOR i := 0 TO 7
3563/// j := i*32
3564/// IF __M[j+31] == 1
3565/// Store32(__X+(i*4), __Y[j+31:j])
3566/// FI
3567/// ENDFOR
3568/// \endcode
3569///
3570/// \headerfile <immintrin.h>
3571///
3572/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3573///
3574/// \param __X
3575/// A pointer to the memory used for storing values.
3576/// \param __M
3577/// A 256-bit vector of [8 x i32] containing the mask bits.
3578/// \param __Y
3579/// A 256-bit vector of [8 x i32] containing the values to store.
3580static __inline__ void __DEFAULT_FN_ATTRS256
3581_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3582{
3583 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3584}
3585
3586/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3587/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3588/// the corresponding element in the mask \a __M is set; otherwise, the
3589/// memory element is unchanged.
3590///
3591/// \code{.operation}
3592/// FOR i := 0 TO 3
3593/// j := i*64
3594/// IF __M[j+63] == 1
3595/// Store64(__X+(i*8), __Y[j+63:j])
3596/// FI
3597/// ENDFOR
3598/// \endcode
3599///
3600/// \headerfile <immintrin.h>
3601///
3602/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3603///
3604/// \param __X
3605/// A pointer to the memory used for storing values.
3606/// \param __M
3607/// A 256-bit vector of [4 x i64] containing the mask bits.
3608/// \param __Y
3609/// A 256-bit vector of [4 x i64] containing the values to store.
3610static __inline__ void __DEFAULT_FN_ATTRS256
3611_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3612{
3613 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3614}
3615
3616/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3617/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3618/// the corresponding element in the mask \a __M is set; otherwise, the
3619/// memory element is unchanged.
3620///
3621/// \code{.operation}
3622/// FOR i := 0 TO 3
3623/// j := i*32
3624/// IF __M[j+31] == 1
3625/// Store32(__X+(i*4), __Y[j+31:j])
3626/// FI
3627/// ENDFOR
3628/// \endcode
3629///
3630/// \headerfile <immintrin.h>
3631///
3632/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3633///
3634/// \param __X
3635/// A pointer to the memory used for storing values.
3636/// \param __M
3637/// A 128-bit vector of [4 x i32] containing the mask bits.
3638/// \param __Y
3639/// A 128-bit vector of [4 x i32] containing the values to store.
3640static __inline__ void __DEFAULT_FN_ATTRS128
3641_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3642{
3643 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3644}
3645
3646/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3647/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3648/// the corresponding element in the mask \a __M is set; otherwise, the
3649/// memory element is unchanged.
3650///
3651/// \code{.operation}
3652/// FOR i := 0 TO 1
3653/// j := i*64
3654/// IF __M[j+63] == 1
3655/// Store64(__X+(i*8), __Y[j+63:j])
3656/// FI
3657/// ENDFOR
3658/// \endcode
3659///
3660/// \headerfile <immintrin.h>
3661///
3662/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3663///
3664/// \param __X
3665/// A pointer to the memory used for storing values.
3666/// \param __M
3667/// A 128-bit vector of [2 x i64] containing the mask bits.
3668/// \param __Y
3669/// A 128-bit vector of [2 x i64] containing the values to store.
3670static __inline__ void __DEFAULT_FN_ATTRS128
3671_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3672{
3673 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3674}
3675
3676/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3677/// left by the number of bits given in the corresponding element of the
3678/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3679/// returns the result. If the shift count for any element is greater than
3680/// 31, the result for that element is zero.
3681///
3682/// \headerfile <immintrin.h>
3683///
3684/// This intrinsic corresponds to the \c VPSLLVD instruction.
3685///
3686/// \param __X
3687/// A 256-bit vector of [8 x i32] to be shifted.
3688/// \param __Y
3689/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3690/// bits).
3691/// \returns A 256-bit vector of [8 x i32] containing the result.
3692static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3693_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3694{
3695 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3696}
3697
3698/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3699/// left by the number of bits given in the corresponding element of the
3700/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3701/// returns the result. If the shift count for any element is greater than
3702/// 31, the result for that element is zero.
3703///
3704/// \headerfile <immintrin.h>
3705///
3706/// This intrinsic corresponds to the \c VPSLLVD instruction.
3707///
3708/// \param __X
3709/// A 128-bit vector of [4 x i32] to be shifted.
3710/// \param __Y
3711/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3712/// bits).
3713/// \returns A 128-bit vector of [4 x i32] containing the result.
3714static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3715_mm_sllv_epi32(__m128i __X, __m128i __Y)
3716{
3717 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3718}
3719
3720/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3721/// left by the number of bits given in the corresponding element of the
3722/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3723/// returns the result. If the shift count for any element is greater than
3724/// 63, the result for that element is zero.
3725///
3726/// \headerfile <immintrin.h>
3727///
3728/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3729///
3730/// \param __X
3731/// A 256-bit vector of [4 x i64] to be shifted.
3732/// \param __Y
3733/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3734/// bits).
3735/// \returns A 256-bit vector of [4 x i64] containing the result.
3736static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3737_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3738{
3739 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3740}
3741
3742/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3743/// left by the number of bits given in the corresponding element of the
3744/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3745/// returns the result. If the shift count for any element is greater than
3746/// 63, the result for that element is zero.
3747///
3748/// \headerfile <immintrin.h>
3749///
3750/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3751///
3752/// \param __X
3753/// A 128-bit vector of [2 x i64] to be shifted.
3754/// \param __Y
3755/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3756/// bits).
3757/// \returns A 128-bit vector of [2 x i64] containing the result.
3758static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3759_mm_sllv_epi64(__m128i __X, __m128i __Y)
3760{
3761 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3762}
3763
3764/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3765/// right by the number of bits given in the corresponding element of the
3766/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3767/// returns the result. If the shift count for any element is greater than
3768/// 31, the result for that element is 0 or -1 according to the sign bit
3769/// for that element.
3770///
3771/// \headerfile <immintrin.h>
3772///
3773/// This intrinsic corresponds to the \c VPSRAVD instruction.
3774///
3775/// \param __X
3776/// A 256-bit vector of [8 x i32] to be shifted.
3777/// \param __Y
3778/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3779/// bits).
3780/// \returns A 256-bit vector of [8 x i32] containing the result.
3781static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3782_mm256_srav_epi32(__m256i __X, __m256i __Y)
3783{
3784 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3785}
3786
3787/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3788/// right by the number of bits given in the corresponding element of the
3789/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3790/// returns the result. If the shift count for any element is greater than
3791/// 31, the result for that element is 0 or -1 according to the sign bit
3792/// for that element.
3793///
3794/// \headerfile <immintrin.h>
3795///
3796/// This intrinsic corresponds to the \c VPSRAVD instruction.
3797///
3798/// \param __X
3799/// A 128-bit vector of [4 x i32] to be shifted.
3800/// \param __Y
3801/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3802/// bits).
3803/// \returns A 128-bit vector of [4 x i32] containing the result.
3804static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3805_mm_srav_epi32(__m128i __X, __m128i __Y)
3806{
3807 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3808}
3809
3810/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3811/// right by the number of bits given in the corresponding element of the
3812/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3813/// returns the result. If the shift count for any element is greater than
3814/// 31, the result for that element is zero.
3815///
3816/// \headerfile <immintrin.h>
3817///
3818/// This intrinsic corresponds to the \c VPSRLVD instruction.
3819///
3820/// \param __X
3821/// A 256-bit vector of [8 x i32] to be shifted.
3822/// \param __Y
3823/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3824/// bits).
3825/// \returns A 256-bit vector of [8 x i32] containing the result.
3826static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3827_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3828{
3829 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3830}
3831
3832/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3833/// right by the number of bits given in the corresponding element of the
3834/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3835/// returns the result. If the shift count for any element is greater than
3836/// 31, the result for that element is zero.
3837///
3838/// \headerfile <immintrin.h>
3839///
3840/// This intrinsic corresponds to the \c VPSRLVD instruction.
3841///
3842/// \param __X
3843/// A 128-bit vector of [4 x i32] to be shifted.
3844/// \param __Y
3845/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3846/// bits).
3847/// \returns A 128-bit vector of [4 x i32] containing the result.
3848static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3849_mm_srlv_epi32(__m128i __X, __m128i __Y)
3850{
3851 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3852}
3853
3854/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3855/// right by the number of bits given in the corresponding element of the
3856/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3857/// returns the result. If the shift count for any element is greater than
3858/// 63, the result for that element is zero.
3859///
3860/// \headerfile <immintrin.h>
3861///
3862/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3863///
3864/// \param __X
3865/// A 256-bit vector of [4 x i64] to be shifted.
3866/// \param __Y
3867/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3868/// bits).
3869/// \returns A 256-bit vector of [4 x i64] containing the result.
3870static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3871_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3872{
3873 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3874}
3875
3876/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3877/// right by the number of bits given in the corresponding element of the
3878/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3879/// returns the result. If the shift count for any element is greater than
3880/// 63, the result for that element is zero.
3881///
3882/// \headerfile <immintrin.h>
3883///
3884/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3885///
3886/// \param __X
3887/// A 128-bit vector of [2 x i64] to be shifted.
3888/// \param __Y
3889/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3890/// bits).
3891/// \returns A 128-bit vector of [2 x i64] containing the result.
3892static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3893_mm_srlv_epi64(__m128i __X, __m128i __Y)
3894{
3895 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3896}
3897
3898/// Conditionally gathers two 64-bit floating-point values, either from the
3899/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3900/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3901/// of [2 x double] in \a mask determines the source for each element.
3902///
3903/// \code{.operation}
3904/// FOR element := 0 to 1
3905/// j := element*64
3906/// k := element*32
3907/// IF mask[j+63] == 0
3908/// result[j+63:j] := a[j+63:j]
3909/// ELSE
3910/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3911/// FI
3912/// ENDFOR
3913/// \endcode
3914///
3915/// \headerfile <immintrin.h>
3916///
3917/// \code
3918/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
3919/// __m128d mask, const int s);
3920/// \endcode
3921///
3922/// This intrinsic corresponds to the \c VGATHERDPD instruction.
3923///
3924/// \param a
3925/// A 128-bit vector of [2 x double] used as the source when a mask bit is
3926/// zero.
3927/// \param m
3928/// A pointer to the memory used for loading values.
3929/// \param i
3930/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3931/// the first two elements are used.
3932/// \param mask
3933/// A 128-bit vector of [2 x double] containing the mask. The most
3934/// significant bit of each element in the mask vector represents the mask
3935/// bits. If a mask bit is zero, the corresponding value from vector \a a
3936/// is gathered; otherwise the value is loaded from memory.
3937/// \param s
3938/// A literal constant scale factor for the indexes in \a i. Must be
3939/// 1, 2, 4, or 8.
3940/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Macro form: the scale s is passed through to the builtin as written.
   The pass-through operand a is cast via __m128d, matching its documented
   type and the other gather_pd macros. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
3946
3947/// Conditionally gathers four 64-bit floating-point values, either from the
3948/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
3949/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
3950/// of [4 x double] in \a mask determines the source for each element.
3951///
3952/// \code{.operation}
3953/// FOR element := 0 to 3
3954/// j := element*64
3955/// k := element*32
3956/// IF mask[j+63] == 0
3957/// result[j+63:j] := a[j+63:j]
3958/// ELSE
3959/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3960/// FI
3961/// ENDFOR
3962/// \endcode
3963///
3964/// \headerfile <immintrin.h>
3965///
3966/// \code
3967/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
3968/// __m256d mask, const int s);
3969/// \endcode
3970///
3971/// This intrinsic corresponds to the \c VGATHERDPD instruction.
3972///
3973/// \param a
3974/// A 256-bit vector of [4 x double] used as the source when a mask bit is
3975/// zero.
3976/// \param m
3977/// A pointer to the memory used for loading values.
3978/// \param i
3979/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3980/// \param mask
3981/// A 256-bit vector of [4 x double] containing the mask. The most
3982/// significant bit of each element in the mask vector represents the mask
3983/// bits. If a mask bit is zero, the corresponding value from vector \a a
3984/// is gathered; otherwise the value is loaded from memory.
3985/// \param s
3986/// A literal constant scale factor for the indexes in \a i. Must be
3987/// 1, 2, 4, or 8.
3988/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Macro form: the scale s is passed through to the builtin as written; the
   other operands are cast to the vector/pointer types the builtin takes. */
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
3994
3995/// Conditionally gathers two 64-bit floating-point values, either from the
3996/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3997/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3998/// of [2 x double] in \a mask determines the source for each element.
3999///
4000/// \code{.operation}
4001/// FOR element := 0 to 1
4002/// j := element*64
4003/// k := element*64
4004/// IF mask[j+63] == 0
4005/// result[j+63:j] := a[j+63:j]
4006/// ELSE
4007/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4008/// FI
4009/// ENDFOR
4010/// \endcode
4011///
4012/// \headerfile <immintrin.h>
4013///
4014/// \code
4015/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4016/// __m128d mask, const int s);
4017/// \endcode
4018///
4019/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4020///
4021/// \param a
4022/// A 128-bit vector of [2 x double] used as the source when a mask bit is
4023/// zero.
4024/// \param m
4025/// A pointer to the memory used for loading values.
4026/// \param i
4027/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4028/// \param mask
4029/// A 128-bit vector of [2 x double] containing the mask. The most
4030/// significant bit of each element in the mask vector represents the mask
4031/// bits. If a mask bit is zero, the corresponding value from vector \a a
4032/// is gathered; otherwise the value is loaded from memory.
4033/// \param s
4034/// A literal constant scale factor for the indexes in \a i. Must be
4035/// 1, 2, 4, or 8.
4036/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Macro form: the scale s is passed through to the builtin as written; the
   other operands are cast to the vector/pointer types the builtin takes. */
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), \
      (double const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), \
      (s)))
4042
4043/// Conditionally gathers four 64-bit floating-point values, either from the
4044/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4045/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4046/// of [4 x double] in \a mask determines the source for each element.
4047///
4048/// \code{.operation}
4049/// FOR element := 0 to 3
4050/// j := element*64
4051/// k := element*64
4052/// IF mask[j+63] == 0
4053/// result[j+63:j] := a[j+63:j]
4054/// ELSE
4055/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4056/// FI
4057/// ENDFOR
4058/// \endcode
4059///
4060/// \headerfile <immintrin.h>
4061///
4062/// \code
4063/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4064/// __m256d mask, const int s);
4065/// \endcode
4066///
4067/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4068///
4069/// \param a
4070/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4071/// zero.
4072/// \param m
4073/// A pointer to the memory used for loading values.
4074/// \param i
4075/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4076/// \param mask
4077/// A 256-bit vector of [4 x double] containing the mask. The most
4078/// significant bit of each element in the mask vector represents the mask
4079/// bits. If a mask bit is zero, the corresponding value from vector \a a
4080/// is gathered; otherwise the value is loaded from memory.
4081/// \param s
4082/// A literal constant scale factor for the indexes in \a i. Must be
4083/// 1, 2, 4, or 8.
4084/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Macro form: the scale s is passed through to the builtin as written; the
   other operands are cast to the vector/pointer types the builtin takes. */
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
4090
4091/// Conditionally gathers four 32-bit floating-point values, either from the
4092/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4093/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4094/// of [4 x float] in \a mask determines the source for each element.
4095///
4096/// \code{.operation}
4097/// FOR element := 0 to 3
4098/// j := element*32
4099/// k := element*32
4100/// IF mask[j+31] == 0
4101/// result[j+31:j] := a[j+31:j]
4102/// ELSE
4103/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4104/// FI
4105/// ENDFOR
4106/// \endcode
4107///
4108/// \headerfile <immintrin.h>
4109///
4110/// \code
4111/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4112/// __m128 mask, const int s);
4113/// \endcode
4114///
4115/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4116///
4117/// \param a
4118/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4119/// zero.
4120/// \param m
4121/// A pointer to the memory used for loading values.
4122/// \param i
4123/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4124/// \param mask
4125/// A 128-bit vector of [4 x float] containing the mask. The most
4126/// significant bit of each element in the mask vector represents the mask
4127/// bits. If a mask bit is zero, the corresponding value from vector \a a
4128/// is gathered; otherwise the value is loaded from memory.
4129/// \param s
4130/// A literal constant scale factor for the indexes in \a i. Must be
4131/// 1, 2, 4, or 8.
4132/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a),    /* used where the mask bit is 0 */ \
      (float const *)(m),     /* base address */ \
      (__v4si)(__m128i)(i),   /* four 32-bit indexes */ \
      (__v4sf)(__m128)(mask), /* MSB of each element selects the load */ \
      (s)))                   /* scale: 1, 2, 4, or 8 */
4138
4139/// Conditionally gathers eight 32-bit floating-point values, either from the
4140/// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4141/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4142/// of [8 x float] in \a mask determines the source for each element.
4143///
4144/// \code{.operation}
4145/// FOR element := 0 to 7
4146/// j := element*32
4147/// k := element*32
4148/// IF mask[j+31] == 0
4149/// result[j+31:j] := a[j+31:j]
4150/// ELSE
4151/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4152/// FI
4153/// ENDFOR
4154/// \endcode
4155///
4156/// \headerfile <immintrin.h>
4157///
4158/// \code
4159/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4160/// __m256 mask, const int s);
4161/// \endcode
4162///
4163/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4164///
4165/// \param a
4166/// A 256-bit vector of [8 x float] used as the source when a mask bit is
4167/// zero.
4168/// \param m
4169/// A pointer to the memory used for loading values.
4170/// \param i
4171/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4172/// \param mask
4173/// A 256-bit vector of [8 x float] containing the mask. The most
4174/// significant bit of each element in the mask vector represents the mask
4175/// bits. If a mask bit is zero, the corresponding value from vector \a a
4176/// is gathered; otherwise the value is loaded from memory.
4177/// \param s
4178/// A literal constant scale factor for the indexes in \a i. Must be
4179/// 1, 2, 4, or 8.
4180/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a),    /* used where the mask bit is 0 */ \
      (float const *)(m),     /* base address */ \
      (__v8si)(__m256i)(i),   /* eight 32-bit indexes */ \
      (__v8sf)(__m256)(mask), /* MSB of each element selects the load */ \
      (s)))                   /* scale: 1, 2, 4, or 8 */
4186
4187/// Conditionally gathers two 32-bit floating-point values, either from the
4188/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4189/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4190/// of [4 x float] in \a mask determines the source for the lower two
4191/// elements. The upper two elements of the result are zeroed.
4192///
4193/// \code{.operation}
4194/// FOR element := 0 to 1
4195/// j := element*32
4196/// k := element*64
4197/// IF mask[j+31] == 0
4198/// result[j+31:j] := a[j+31:j]
4199/// ELSE
4200/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4201/// FI
4202/// ENDFOR
4203/// result[127:64] := 0
4204/// \endcode
4205///
4206/// \headerfile <immintrin.h>
4207///
4208/// \code
4209/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4210/// __m128 mask, const int s);
4211/// \endcode
4212///
4213/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4214///
4215/// \param a
4216/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4217/// zero. Only the first two elements are used.
4218/// \param m
4219/// A pointer to the memory used for loading values.
4220/// \param i
4221/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4222/// \param mask
4223/// A 128-bit vector of [4 x float] containing the mask. The most
4224/// significant bit of each element in the mask vector represents the mask
4225/// bits. If a mask bit is zero, the corresponding value from vector \a a
4226/// is gathered; otherwise the value is loaded from memory. Only the first
4227/// two elements are used.
4228/// \param s
4229/// A literal constant scale factor for the indexes in \a i. Must be
4230/// 1, 2, 4, or 8.
4231/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a),    /* used where the mask bit is 0 */ \
      (float const *)(m),     /* base address */ \
      (__v2di)(__m128i)(i),   /* two 64-bit indexes */ \
      (__v4sf)(__m128)(mask), /* MSB of each element selects the load */ \
      (s)))                   /* scale: 1, 2, 4, or 8 */
4237
4238/// Conditionally gathers four 32-bit floating-point values, either from the
4239/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4240/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4241/// of [4 x float] in \a mask determines the source for each element.
4242///
4243/// \code{.operation}
4244/// FOR element := 0 to 3
4245/// j := element*32
4246/// k := element*64
4247/// IF mask[j+31] == 0
4248/// result[j+31:j] := a[j+31:j]
4249/// ELSE
4250/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4251/// FI
4252/// ENDFOR
4253/// \endcode
4254///
4255/// \headerfile <immintrin.h>
4256///
4257/// \code
4258/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4259/// __m128 mask, const int s);
4260/// \endcode
4261///
4262/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4263///
4264/// \param a
4265/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4266/// zero.
4267/// \param m
4268/// A pointer to the memory used for loading values.
4269/// \param i
4270/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4271/// \param mask
4272/// A 128-bit vector of [4 x float] containing the mask. The most
4273/// significant bit of each element in the mask vector represents the mask
4274/// bits. If a mask bit is zero, the corresponding value from vector \a a
4275/// is gathered; otherwise the value is loaded from memory.
4276/// \param s
4277/// A literal constant scale factor for the indexes in \a i. Must be
4278/// 1, 2, 4, or 8.
4279/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a),    /* used where the mask bit is 0 */ \
      (float const *)(m),     /* base address */ \
      (__v4di)(__m256i)(i),   /* four 64-bit indexes */ \
      (__v4sf)(__m128)(mask), /* MSB of each element selects the load */ \
      (s)))                   /* scale: 1, 2, 4, or 8 */
4285
4286/// Conditionally gathers four 32-bit integer values, either from the
4287/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4288/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4289/// of [4 x i32] in \a mask determines the source for each element.
4290///
4291/// \code{.operation}
4292/// FOR element := 0 to 3
4293/// j := element*32
4294/// k := element*32
4295/// IF mask[j+31] == 0
4296/// result[j+31:j] := a[j+31:j]
4297/// ELSE
4298/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4299/// FI
4300/// ENDFOR
4301/// \endcode
4302///
4303/// \headerfile <immintrin.h>
4304///
4305/// \code
4306/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4307/// __m128i mask, const int s);
4308/// \endcode
4309///
4310/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4311///
4312/// \param a
4313/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4314/// zero.
4315/// \param m
4316/// A pointer to the memory used for loading values.
4317/// \param i
4318/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4319/// \param mask
4320/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4321/// bit of each element in the mask vector represents the mask bits. If a
4322/// mask bit is zero, the corresponding value from vector \a a is gathered;
4323/// otherwise the value is loaded from memory.
4324/// \param s
4325/// A literal constant scale factor for the indexes in \a i. Must be
4326/// 1, 2, 4, or 8.
4327/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a),    /* used where the mask bit is 0 */ \
      (int const *)(m),        /* base address */ \
      (__v4si)(__m128i)(i),    /* four 32-bit indexes */ \
      (__v4si)(__m128i)(mask), /* MSB of each element selects the load */ \
      (s)))                    /* scale: 1, 2, 4, or 8 */
4333
4334/// Conditionally gathers eight 32-bit integer values, either from the
4335/// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4336/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4337/// of [8 x i32] in \a mask determines the source for each element.
4338///
4339/// \code{.operation}
4340/// FOR element := 0 to 7
4341/// j := element*32
4342/// k := element*32
4343/// IF mask[j+31] == 0
4344/// result[j+31:j] := a[j+31:j]
4345/// ELSE
4346/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4347/// FI
4348/// ENDFOR
4349/// \endcode
4350///
4351/// \headerfile <immintrin.h>
4352///
4353/// \code
4354/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4355/// __m256i mask, const int s);
4356/// \endcode
4357///
4358/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4359///
4360/// \param a
4361/// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4362/// zero.
4363/// \param m
4364/// A pointer to the memory used for loading values.
4365/// \param i
4366/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4367/// \param mask
4368/// A 256-bit vector of [8 x i32] containing the mask. The most significant
4369/// bit of each element in the mask vector represents the mask bits. If a
4370/// mask bit is zero, the corresponding value from vector \a a is gathered;
4371/// otherwise the value is loaded from memory.
4372/// \param s
4373/// A literal constant scale factor for the indexes in \a i. Must be
4374/// 1, 2, 4, or 8.
4375/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)(__m256i)(a),    /* used where the mask bit is 0 */ \
      (int const *)(m),        /* base address */ \
      (__v8si)(__m256i)(i),    /* eight 32-bit indexes */ \
      (__v8si)(__m256i)(mask), /* MSB of each element selects the load */ \
      (s)))                    /* scale: 1, 2, 4, or 8 */
4381
4382/// Conditionally gathers two 32-bit integer values, either from the
4383/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4384/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4385/// of [4 x i32] in \a mask determines the source for the lower two
4386/// elements. The upper two elements of the result are zeroed.
4387///
4388/// \code{.operation}
4389/// FOR element := 0 to 1
4390/// j := element*32
4391/// k := element*64
4392/// IF mask[j+31] == 0
4393/// result[j+31:j] := a[j+31:j]
4394/// ELSE
4395/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4396/// FI
4397/// ENDFOR
4398/// result[127:64] := 0
4399/// \endcode
4400///
4401/// \headerfile <immintrin.h>
4402///
4403/// \code
4404/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4405/// __m128i mask, const int s);
4406/// \endcode
4407///
4408/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4409///
4410/// \param a
4411/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4412/// zero. Only the first two elements are used.
4413/// \param m
4414/// A pointer to the memory used for loading values.
4415/// \param i
4416/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4417/// \param mask
4418/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4419/// bit of each element in the mask vector represents the mask bits. If a
4420/// mask bit is zero, the corresponding value from vector \a a is gathered;
4421/// otherwise the value is loaded from memory. Only the first two elements
4422/// are used.
4423/// \param s
4424/// A literal constant scale factor for the indexes in \a i. Must be
4425/// 1, 2, 4, or 8.
4426/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)(__m128i)(a),    /* used where the mask bit is 0 */ \
      (int const *)(m),        /* base address */ \
      (__v2di)(__m128i)(i),    /* two 64-bit indexes */ \
      (__v4si)(__m128i)(mask), /* MSB of each element selects the load */ \
      (s)))                    /* scale: 1, 2, 4, or 8 */
4432
4433/// Conditionally gathers four 32-bit integer values, either from the
4434/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4435/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4436/// of [4 x i32] in \a mask determines the source for each element.
4437///
4438/// \code{.operation}
4439/// FOR element := 0 to 3
4440/// j := element*32
4441/// k := element*64
4442/// IF mask[j+31] == 0
4443/// result[j+31:j] := a[j+31:j]
4444/// ELSE
4445/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4446/// FI
4447/// ENDFOR
4448/// \endcode
4449///
4450/// \headerfile <immintrin.h>
4451///
4452/// \code
4453/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4454/// __m128i mask, const int s);
4455/// \endcode
4456///
4457/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4458///
4459/// \param a
4460/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4461/// zero.
4462/// \param m
4463/// A pointer to the memory used for loading values.
4464/// \param i
4465/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4466/// \param mask
4467/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4468/// bit of each element in the mask vector represents the mask bits. If a
4469/// mask bit is zero, the corresponding value from vector \a a is gathered;
4470/// otherwise the value is loaded from memory.
4471/// \param s
4472/// A literal constant scale factor for the indexes in \a i. Must be
4473/// 1, 2, 4, or 8.
4474/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a),    /* used where the mask bit is 0 */ \
      (int const *)(m),        /* base address */ \
      (__v4di)(__m256i)(i),    /* four 64-bit indexes */ \
      (__v4si)(__m128i)(mask), /* MSB of each element selects the load */ \
      (s)))                    /* scale: 1, 2, 4, or 8 */
4480
4481/// Conditionally gathers two 64-bit integer values, either from the
4482/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4483/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4484/// of [2 x i64] in \a mask determines the source for each element.
4485///
4486/// \code{.operation}
4487/// FOR element := 0 to 1
4488/// j := element*64
4489/// k := element*32
4490/// IF mask[j+63] == 0
4491/// result[j+63:j] := a[j+63:j]
4492/// ELSE
4493/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4494/// FI
4495/// ENDFOR
4496/// \endcode
4497///
4498/// \headerfile <immintrin.h>
4499///
4500/// \code
4501/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4502/// __m128i mask, const int s);
4503/// \endcode
4504///
4505/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4506///
4507/// \param a
4508/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4509/// zero.
4510/// \param m
4511/// A pointer to the memory used for loading values.
4512/// \param i
4513/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4514/// the first two elements are used.
4515/// \param mask
4516/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4517/// bit of each element in the mask vector represents the mask bits. If a
4518/// mask bit is zero, the corresponding value from vector \a a is gathered;
4519/// otherwise the value is loaded from memory.
4520/// \param s
4521/// A literal constant scale factor for the indexes in \a i. Must be
4522/// 1, 2, 4, or 8.
4523/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a),    /* used where the mask bit is 0 */ \
      (long long const *)(m),  /* base address */ \
      (__v4si)(__m128i)(i),    /* 32-bit indexes; first two are used */ \
      (__v2di)(__m128i)(mask), /* MSB of each element selects the load */ \
      (s)))                    /* scale: 1, 2, 4, or 8 */
4529
4530/// Conditionally gathers four 64-bit integer values, either from the
4531/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4532/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4533/// of [4 x i64] in \a mask determines the source for each element.
4534///
4535/// \code{.operation}
4536/// FOR element := 0 to 3
4537/// j := element*64
4538/// k := element*32
4539/// IF mask[j+63] == 0
4540/// result[j+63:j] := a[j+63:j]
4541/// ELSE
4542/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4543/// FI
4544/// ENDFOR
4545/// \endcode
4546///
4547/// \headerfile <immintrin.h>
4548///
4549/// \code
4550/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4551/// __m128i i, __m256i mask, const int s);
4552/// \endcode
4553///
4554/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4555///
4556/// \param a
4557/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4558/// zero.
4559/// \param m
4560/// A pointer to the memory used for loading values.
4561/// \param i
4562/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4563/// \param mask
4564/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4565/// bit of each element in the mask vector represents the mask bits. If a
4566/// mask bit is zero, the corresponding value from vector \a a is gathered;
4567/// otherwise the value is loaded from memory.
4568/// \param s
4569/// A literal constant scale factor for the indexes in \a i. Must be
4570/// 1, 2, 4, or 8.
4571/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a),    /* used where the mask bit is 0 */ \
      (long long const *)(m),  /* base address */ \
      (__v4si)(__m128i)(i),    /* four 32-bit indexes */ \
      (__v4di)(__m256i)(mask), /* MSB of each element selects the load */ \
      (s)))                    /* scale: 1, 2, 4, or 8 */
4577
4578/// Conditionally gathers two 64-bit integer values, either from the
4579/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4580/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4581/// of [2 x i64] in \a mask determines the source for each element.
4582///
4583/// \code{.operation}
4584/// FOR element := 0 to 1
4585/// j := element*64
4586/// k := element*64
4587/// IF mask[j+63] == 0
4588/// result[j+63:j] := a[j+63:j]
4589/// ELSE
4590/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4591/// FI
4592/// ENDFOR
4593/// \endcode
4594///
4595/// \headerfile <immintrin.h>
4596///
4597/// \code
4598/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4599/// __m128i mask, const int s);
4600/// \endcode
4601///
4602/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4603///
4604/// \param a
4605/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4606/// zero.
4607/// \param m
4608/// A pointer to the memory used for loading values.
4609/// \param i
4610/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4611/// \param mask
4612/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4613/// bit of each element in the mask vector represents the mask bits. If a
4614/// mask bit is zero, the corresponding value from vector \a a is gathered;
4615/// otherwise the value is loaded from memory.
4616/// \param s
4617/// A literal constant scale factor for the indexes in \a i. Must be
4618/// 1, 2, 4, or 8.
4619/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a),    /* used where the mask bit is 0 */ \
      (long long const *)(m),  /* base address */ \
      (__v2di)(__m128i)(i),    /* two 64-bit indexes */ \
      (__v2di)(__m128i)(mask), /* MSB of each element selects the load */ \
      (s)))                    /* scale: 1, 2, 4, or 8 */
4625
4626/// Conditionally gathers four 64-bit integer values, either from the
4627/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4628/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4629/// of [4 x i64] in \a mask determines the source for each element.
4630///
4631/// \code{.operation}
4632/// FOR element := 0 to 3
4633/// j := element*64
4634/// k := element*64
4635/// IF mask[j+63] == 0
4636/// result[j+63:j] := a[j+63:j]
4637/// ELSE
4638/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4639/// FI
4640/// ENDFOR
4641/// \endcode
4642///
4643/// \headerfile <immintrin.h>
4644///
4645/// \code
4646/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4647/// __m256i i, __m256i mask, const int s);
4648/// \endcode
4649///
4650/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4651///
4652/// \param a
4653/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4654/// zero.
4655/// \param m
4656/// A pointer to the memory used for loading values.
4657/// \param i
4658/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4659/// \param mask
4660/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4661/// bit of each element in the mask vector represents the mask bits. If a
4662/// mask bit is zero, the corresponding value from vector \a a is gathered;
4663/// otherwise the value is loaded from memory.
4664/// \param s
4665/// A literal constant scale factor for the indexes in \a i. Must be
4666/// 1, 2, 4, or 8.
4667/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a),    /* used where the mask bit is 0 */ \
      (long long const *)(m),  /* base address */ \
      (__v4di)(__m256i)(i),    /* four 64-bit indexes */ \
      (__v4di)(__m256i)(mask), /* MSB of each element selects the load */ \
      (s)))                    /* scale: 1, 2, 4, or 8 */
4673
4674/// Gathers two 64-bit floating-point values from memory \a m using scaled
4675/// indexes from the 128-bit vector of [4 x i32] in \a i.
4676///
4677/// \code{.operation}
4678/// FOR element := 0 to 1
4679/// j := element*64
4680/// k := element*32
4681/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4682/// ENDFOR
4683/// \endcode
4684///
4685/// \headerfile <immintrin.h>
4686///
4687/// \code
4688/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4689/// \endcode
4690///
4691/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4692///
4693/// \param m
4694/// A pointer to the memory used for loading values.
4695/// \param i
4696/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4697/// the first two elements are used.
4698/// \param s
4699/// A literal constant scale factor for the indexes in \a i. Must be
4700/// 1, 2, 4, or 8.
4701/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      /* Source operand is a don't-care: the mask below is all ones. */ \
      (__v2df)_mm_undefined_pd(), \
      (double const *)(m), \
      (__v4si)(__m128i)(i), \
      /* 0.0 == 0.0 holds in every element, which yields an all-ones mask. */ \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      (s)))
4709
4710/// Gathers four 64-bit floating-point values from memory \a m using scaled
4711/// indexes from the 128-bit vector of [4 x i32] in \a i.
4712///
4713/// \code{.operation}
4714/// FOR element := 0 to 3
4715/// j := element*64
4716/// k := element*32
4717/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4718/// ENDFOR
4719/// \endcode
4720///
4721/// \headerfile <immintrin.h>
4722///
4723/// \code
4724/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4725/// \endcode
4726///
4727/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4728///
4729/// \param m
4730/// A pointer to the memory used for loading values.
4731/// \param i
4732/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4733/// \param s
4734/// A literal constant scale factor for the indexes in \a i. Must be
4735/// 1, 2, 4, or 8.
4736/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      /* Source operand is a don't-care: the mask below is all ones. */ \
      (__v4df)_mm256_undefined_pd(), \
      (double const *)(m), \
      (__v4si)(__m128i)(i), \
      /* 0.0 == 0.0 holds in every element, which yields an all-ones mask. */ \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4745
4746/// Gathers two 64-bit floating-point values from memory \a m using scaled
4747/// indexes from the 128-bit vector of [2 x i64] in \a i.
4748///
4749/// \code{.operation}
4750/// FOR element := 0 to 1
4751/// j := element*64
4752/// k := element*64
4753/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4754/// ENDFOR
4755/// \endcode
4756///
4757/// \headerfile <immintrin.h>
4758///
4759/// \code
4760/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4761/// \endcode
4762///
4763/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4764///
4765/// \param m
4766/// A pointer to the memory used for loading values.
4767/// \param i
4768/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4769/// \param s
4770/// A literal constant scale factor for the indexes in \a i. Must be
4771/// 1, 2, 4, or 8.
4772/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      /* Source operand is a don't-care: the mask below is all ones. */ \
      (__v2df)_mm_undefined_pd(), \
      (double const *)(m), \
      (__v2di)(__m128i)(i), \
      /* 0.0 == 0.0 holds in every element, which yields an all-ones mask. */ \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      (s)))
4780
4781/// Gathers four 64-bit floating-point values from memory \a m using scaled
4782/// indexes from the 256-bit vector of [4 x i64] in \a i.
4783///
4784/// \code{.operation}
4785/// FOR element := 0 to 3
4786/// j := element*64
4787/// k := element*64
4788/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4789/// ENDFOR
4790/// \endcode
4791///
4792/// \headerfile <immintrin.h>
4793///
4794/// \code
4795/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4796/// \endcode
4797///
4798/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4799///
4800/// \param m
4801/// A pointer to the memory used for loading values.
4802/// \param i
4803/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4804/// \param s
4805/// A literal constant scale factor for the indexes in \a i. Must be
4806/// 1, 2, 4, or 8.
4807/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      /* Source operand is a don't-care: the mask below is all ones. */ \
      (__v4df)_mm256_undefined_pd(), \
      (double const *)(m), \
      (__v4di)(__m256i)(i), \
      /* 0.0 == 0.0 holds in every element, which yields an all-ones mask. */ \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4816
4817/// Gathers four 32-bit floating-point values from memory \a m using scaled
4818/// indexes from the 128-bit vector of [4 x i32] in \a i.
4819///
4820/// \code{.operation}
4821/// FOR element := 0 to 3
4822/// j := element*32
4823/// k := element*32
4824/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4825/// ENDFOR
4826/// \endcode
4827///
4828/// \headerfile <immintrin.h>
4829///
4830/// \code
4831/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4832/// \endcode
4833///
4834/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4835///
4836/// \param m
4837/// A pointer to the memory used for loading values.
4838/// \param i
4839/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4840/// \param s
4841/// A literal constant scale factor for the indexes in \a i. Must be
4842/// 1, 2, 4, or 8.
4843/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      /* Source operand is a don't-care: the mask below is all ones. */ \
      (__v4sf)_mm_undefined_ps(), \
      (float const *)(m), \
      (__v4si)(__m128i)(i), \
      /* 0.0f == 0.0f holds in every element, which yields an all-ones mask. */ \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
4851
4852/// Gathers eight 32-bit floating-point values from memory \a m using scaled
4853/// indexes from the 256-bit vector of [8 x i32] in \a i.
4854///
4855/// \code{.operation}
4856/// FOR element := 0 to 7
4857/// j := element*32
4858/// k := element*32
4859/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4860/// ENDFOR
4861/// \endcode
4862///
4863/// \headerfile <immintrin.h>
4864///
4865/// \code
4866/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4867/// \endcode
4868///
4869/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4870///
4871/// \param m
4872/// A pointer to the memory used for loading values.
4873/// \param i
4874/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4875/// \param s
4876/// A literal constant scale factor for the indexes in \a i. Must be
4877/// 1, 2, 4, or 8.
4878/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      /* Source operand is a don't-care: the mask below is all ones. */ \
      (__v8sf)_mm256_undefined_ps(), \
      (float const *)(m), \
      (__v8si)(__m256i)(i), \
      /* 0.0f == 0.0f holds in every element, which yields an all-ones mask. */ \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))
4887
4888/// Gathers two 32-bit floating-point values from memory \a m using scaled
4889/// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4890/// elements of the result are zeroed.
4891///
4892/// \code{.operation}
4893/// FOR element := 0 to 1
4894/// j := element*32
4895/// k := element*64
4896/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4897/// ENDFOR
4898/// result[127:64] := 0
4899/// \endcode
4900///
4901/// \headerfile <immintrin.h>
4902///
4903/// \code
4904/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4905/// \endcode
4906///
4907/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4908///
4909/// \param m
4910/// A pointer to the memory used for loading values.
4911/// \param i
4912/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4913/// \param s
4914/// A literal constant scale factor for the indexes in \a i. Must be
4915/// 1, 2, 4, or 8.
4916/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      /* Source operand is a don't-care: the mask below is all ones. */ \
      (__v4sf)_mm_undefined_ps(), \
      (float const *)(m), \
      (__v2di)(__m128i)(i), \
      /* 0.0f == 0.0f holds in every element, which yields an all-ones mask. */ \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
4924
4925/// Gathers four 32-bit floating-point values from memory \a m using scaled
4926/// indexes from the 256-bit vector of [4 x i64] in \a i.
4927///
4928/// \code{.operation}
4929/// FOR element := 0 to 3
4930/// j := element*32
4931/// k := element*64
4932/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4933/// ENDFOR
4934/// \endcode
4935///
4936/// \headerfile <immintrin.h>
4937///
4938/// \code
4939/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4940/// \endcode
4941///
4942/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4943///
4944/// \param m
4945/// A pointer to the memory used for loading values.
4946/// \param i
4947/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4948/// \param s
4949/// A literal constant scale factor for the indexes in \a i. Must be
4950/// 1, 2, 4, or 8.
4951/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      /* Source and mask are 128-bit here: four 64-bit indexes produce only \
         four floats. The source is a don't-care under the all-ones mask. */ \
      (__v4sf)_mm_undefined_ps(), \
      (float const *)(m), \
      (__v4di)(__m256i)(i), \
      /* 0.0f == 0.0f holds in every element, which yields an all-ones mask. */ \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
4959
4960/// Gathers four 32-bit integer values from memory \a m using scaled
4961/// indexes from the 128-bit vector of [4 x i32] in \a i.
4962///
4963/// \code{.operation}
4964/// FOR element := 0 to 3
4965/// j := element*32
4966/// k := element*32
4967/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4968/// ENDFOR
4969/// \endcode
4970///
4971/// \headerfile <immintrin.h>
4972///
4973/// \code
4974/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
4975/// \endcode
4976///
4977/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// The macro expands to \c __builtin_ia32_gatherd_d, passing an undefined
/// vector as the first operand and an all-ones vector as the fourth operand.
4978///
4979/// \param m
4980/// A pointer to the memory used for loading values.
4981/// \param i
4982/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4983/// \param s
4984/// A literal constant scale factor for the indexes in \a i. Must be
4985/// 1, 2, 4, or 8.
4986/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4987#define _mm_i32gather_epi32(m, i, s) \
4988 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
4989 (int const *)(m), (__v4si)(__m128i)(i), \
4990 (__v4si)_mm_set1_epi32(-1), (s)))
4991
4992/// Gathers eight 32-bit integer values from memory \a m using scaled
4993/// indexes from the 256-bit vector of [8 x i32] in \a i.
4994///
4995/// \code{.operation}
4996/// FOR element := 0 to 7
4997/// j := element*32
4998/// k := element*32
4999/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5000/// ENDFOR
5001/// \endcode
5002///
5003/// \headerfile <immintrin.h>
5004///
5005/// \code
5006/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5007/// \endcode
5008///
5009/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// The macro expands to \c __builtin_ia32_gatherd_d256, passing an undefined
/// vector as the first operand and an all-ones vector as the fourth operand.
5010///
5011/// \param m
5012/// A pointer to the memory used for loading values.
5013/// \param i
5014/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5015/// \param s
5016/// A literal constant scale factor for the indexes in \a i. Must be
5017/// 1, 2, 4, or 8.
5018/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5019#define _mm256_i32gather_epi32(m, i, s) \
5020 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5021 (int const *)(m), (__v8si)(__m256i)(i), \
5022 (__v8si)_mm256_set1_epi32(-1), (s)))
5023
5024/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5025/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5026/// of the result are zeroed.
5027///
5028/// \code{.operation}
5029/// FOR element := 0 to 1
5030/// j := element*32
5031/// k := element*64
5032/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5033/// ENDFOR
5034/// result[127:64] := 0
5035/// \endcode
5036///
5037/// \headerfile <immintrin.h>
5038///
5039/// \code
5040/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5041/// \endcode
5042///
5043/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// The macro expands to \c __builtin_ia32_gatherq_d, passing an undefined
/// vector as the first operand and an all-ones vector as the fourth operand.
5044///
5045/// \param m
5046/// A pointer to the memory used for loading values.
5047/// \param i
5048/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5049/// \param s
5050/// A literal constant scale factor for the indexes in \a i. Must be
5051/// 1, 2, 4, or 8.
5052/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5053#define _mm_i64gather_epi32(m, i, s) \
5054 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5055 (int const *)(m), (__v2di)(__m128i)(i), \
5056 (__v4si)_mm_set1_epi32(-1), (s)))
5057
5058/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5059/// from the 256-bit vector of [4 x i64] in \a i.
5060///
5061/// \code{.operation}
5062/// FOR element := 0 to 3
5063/// j := element*32
5064/// k := element*64
5065/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5066/// ENDFOR
5067/// \endcode
5068///
5069/// \headerfile <immintrin.h>
5070///
5071/// \code
5072/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5073/// \endcode
5074///
5075/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// The macro expands to \c __builtin_ia32_gatherq_d256, passing an undefined
/// vector as the first operand and an all-ones vector as the fourth operand.
5076///
5077/// \param m
5078/// A pointer to the memory used for loading values.
5079/// \param i
5080/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5081/// \param s
5082/// A literal constant scale factor for the indexes in \a i. Must be
5083/// 1, 2, 4, or 8.
5084/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5085#define _mm256_i64gather_epi32(m, i, s) \
5086 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5087 (int const *)(m), (__v4di)(__m256i)(i), \
5088 (__v4si)_mm_set1_epi32(-1), (s)))
5089
5090/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5091/// from the 128-bit vector of [4 x i32] in \a i.
5092///
5093/// \code{.operation}
5094/// FOR element := 0 to 1
5095/// j := element*64
5096/// k := element*32
5097/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5098/// ENDFOR
5099/// \endcode
5100///
5101/// \headerfile <immintrin.h>
5102///
5103/// \code
5104/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5105/// \endcode
5106///
5107/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// The macro expands to \c __builtin_ia32_gatherd_q, passing an undefined
/// vector as the first operand and an all-ones vector as the fourth operand.
5108///
5109/// \param m
5110/// A pointer to the memory used for loading values.
5111/// \param i
5112/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5113/// the first two elements are used.
5114/// \param s
5115/// A literal constant scale factor for the indexes in \a i. Must be
5116/// 1, 2, 4, or 8.
5117/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5118#define _mm_i32gather_epi64(m, i, s) \
5119 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5120 (long long const *)(m), \
5121 (__v4si)(__m128i)(i), \
5122 (__v2di)_mm_set1_epi64x(-1), (s)))
5123
5124/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5125/// from the 128-bit vector of [4 x i32] in \a i.
5126///
5127/// \code{.operation}
5128/// FOR element := 0 to 3
5129/// j := element*64
5130/// k := element*32
5131/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5132/// ENDFOR
5133/// \endcode
5134///
5135/// \headerfile <immintrin.h>
5136///
5137/// \code
5138/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5139/// \endcode
5140///
5141/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// The macro expands to \c __builtin_ia32_gatherd_q256, passing an undefined
/// vector as the first operand and an all-ones vector as the fourth operand.
5142///
5143/// \param m
5144/// A pointer to the memory used for loading values.
5145/// \param i
5146/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5147/// \param s
5148/// A literal constant scale factor for the indexes in \a i. Must be
5149/// 1, 2, 4, or 8.
5150/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5151#define _mm256_i32gather_epi64(m, i, s) \
5152 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5153 (long long const *)(m), \
5154 (__v4si)(__m128i)(i), \
5155 (__v4di)_mm256_set1_epi64x(-1), (s)))
5156
5157/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5158/// from the 128-bit vector of [2 x i64] in \a i.
5159///
5160/// \code{.operation}
5161/// FOR element := 0 to 1
5162/// j := element*64
5163/// k := element*64
5164/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5165/// ENDFOR
5166/// \endcode
5167///
5168/// \headerfile <immintrin.h>
5169///
5170/// \code
5171/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5172/// \endcode
5173///
5174/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// The macro expands to \c __builtin_ia32_gatherq_q, passing an undefined
/// vector as the first operand and an all-ones vector as the fourth operand.
5175///
5176/// \param m
5177/// A pointer to the memory used for loading values.
5178/// \param i
5179/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5180/// \param s
5181/// A literal constant scale factor for the indexes in \a i. Must be
5182/// 1, 2, 4, or 8.
5183/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5184#define _mm_i64gather_epi64(m, i, s) \
5185 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5186 (long long const *)(m), \
5187 (__v2di)(__m128i)(i), \
5188 (__v2di)_mm_set1_epi64x(-1), (s)))
5189
5190/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5191/// from the 256-bit vector of [4 x i64] in \a i.
5192///
5193/// \code{.operation}
5194/// FOR element := 0 to 3
5195/// j := element*64
5196/// k := element*64
5197/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5198/// ENDFOR
5199/// \endcode
5200///
5201/// \headerfile <immintrin.h>
5202///
5203/// \code
5204/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5205/// \endcode
5206///
5207/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// The macro expands to \c __builtin_ia32_gatherq_q256, passing an undefined
/// vector as the first operand and an all-ones vector as the fourth operand.
5208///
5209/// \param m
5210/// A pointer to the memory used for loading values.
5211/// \param i
5212/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5213/// \param s
5214/// A literal constant scale factor for the indexes in \a i. Must be
5215/// 1, 2, 4, or 8.
5216/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5217#define _mm256_i64gather_epi64(m, i, s) \
5218 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5219 (long long const *)(m), \
5220 (__v4di)(__m256i)(i), \
5221 (__v4di)_mm256_set1_epi64x(-1), (s)))
5222
5223#undef __DEFAULT_FN_ATTRS256
5224#undef __DEFAULT_FN_ATTRS128
5225#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
5226#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5227
5228#endif /* __AVX2INTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
Definition altivec.h:578
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi32_epi64(__m128i __V)
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and re...
Definition avx2intrin.h:846
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(const void *__V)
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than an...
Definition avx2intrin.h:730
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epu32(__m256i __a, __m256i __b)
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srav_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_andnot_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit int...
Definition avx2intrin.h:474
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation...
Definition avx2intrin.h:394
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi32(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation,...
Definition avx2intrin.h:201
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M)
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to mem...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epu16(__m256i __a, __m256i __b)
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upp...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256...
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastsd_pd(__m128d __a)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M)
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi16(__m128i __V)
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsi...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi32(__m256i __a)
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a a...
Definition avx2intrin.h:139
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64(__m256i __a, __m256i __b)
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed by...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and r...
Definition avx2intrin.h:949
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srav_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi64(__m128i __V)
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and r...
Definition avx2intrin.h:678
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed sa...
Definition avx2intrin.h:376
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits spec...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memor...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 b...
Definition avx2intrin.h:283
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu32_epi64(__m128i __V)
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi32(__m128i __V)
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the...
Definition avx2intrin.h:302
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x f...
static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8(__m256i __a)
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vecto...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the ...
Definition avx2intrin.h:559
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and r...
Definition avx2intrin.h:704
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64(__m256i __a, __m256i __b)
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the ...
Definition avx2intrin.h:340
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M)
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadds_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using ...
Definition avx2intrin.h:913
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and re...
Definition avx2intrin.h:981
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a, __m256i __b)
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greate...
Definition avx2intrin.h:758
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi32(__m128i __V)
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result'...
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi16(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation,...
Definition avx2intrin.h:169
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater...
Definition avx2intrin.h:784
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned satur...
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using sign...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns t...
Definition avx2intrin.h:626
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi8(__m256i __a)
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each va...
Definition avx2intrin.h:107
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi64(__m128i __V)
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a, __m256i __b)
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit interme...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi64(__m128i __V)
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation,...
Definition avx2intrin.h:264
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi64(__m128i __V)
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_and_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
Definition avx2intrin.h:456
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsi128_si256(__m128i __X)
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32(__m256i __a, __m256i __b)
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m256i __a)
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a and returns each value in the corresponding element of the result.
Definition avx2intrin.h:123
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_or_si256(__m256i __a, __m256i __b)
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi16(__m256i __a, __m256i __b)
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation, and returns the 256-bit result.
Definition avx2intrin.h:232
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsd_pd(__m128d __X)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __X to all elements of the result's 256-bit vector of [4 x double].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi32(__m128i __V)
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi64(__m128i __X, __m128i __Y)
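Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result.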
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi32(__m128i __X, __m128i __Y)
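Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result.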
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu16(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns each average in the corresponding element of the 256-bit result.
Definition avx2intrin.h:525
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_xor_si256(__m256i __a, __m256i __b)
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns each sum in the corresponding element of the 256-bit result.
Definition avx2intrin.h:411
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M)
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a, __m256i __b)
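Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [16 x i16] in __b is greater than zero, less than zero, or equal to zero, respectively.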
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in an element of the 256-bit result (overflow is ignored).
Definition avx2intrin.h:878
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 128-bit vector of [4 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 256-bit vector of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi64(__m256i __X, __m256i __Y)
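Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.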
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each difference in the corresponding byte of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.
Definition avx2intrin.h:810
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b)
Computes four sum-of-absolute-differences (SAD) operations on sets of eight unsigned 8-bit integers from the 256-bit integer vectors __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32(__m256i __a, __m256i __b)
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in the corresponding element of the 256-bit result (overflow is ignored).
Definition avx2intrin.h:321
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each sum in the corresponding byte of the 256-bit result.
Definition avx2intrin.h:359
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi64(__m256i __X, __m256i __Y)
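Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.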
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits, shifting in zero bits, and returns the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.
Definition avx2intrin.h:652
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu8(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns each average in the corresponding byte of the 256-bit result.
Definition avx2intrin.h:500
static __inline__ void int __a
Definition emmintrin.h:4077
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19