clang 22.0.0git
avx2intrin.h
Go to the documentation of this file.
1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX2INTRIN_H
15#define __AVX2INTRIN_H
16
/* Default attribute sets for the functions in this file. The 256 and 128
   variants differ only in the __min_vector_width__ argument. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
                 __min_vector_width__(128)))

/* The _CONSTEXPR variants append `constexpr` only when compiling as C++11 or
   later; in every other mode they are plain aliases of the sets above. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#else
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#endif
32
33/* SSE4 Multiple Packed Sums of Absolute Difference. */
34/// Computes sixteen sum of absolute difference (SAD) operations on sets of
35/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
36/// \a Y.
37///
38/// Eight SAD results are computed using the lower half of the input
39/// vectors, and another eight using the upper half. These 16-bit values
40/// are returned in the lower and upper halves of the 256-bit result,
41/// respectively.
42///
43/// A single SAD operation selects four bytes from \a X and four bytes from
44/// \a Y as input. It computes the differences between each \a X byte and
45/// the corresponding \a Y byte, takes the absolute value of each
46/// difference, and sums these four values to form one 16-bit result. The
47/// intrinsic computes 16 of these results with different sets of input
48/// bytes.
49///
/// For each set of eight results, the SAD operations use the same four
/// bytes from \a Y; the starting bit position for these four bytes is
/// specified by \a M[1:0] times 32 for the lower half and by \a M[4:3]
/// times 32 for the upper half. The eight operations use successive
/// sets of four bytes from \a X; the starting bit position for the first
/// set of four bytes is specified by \a M[2] times 32 for the lower half
/// and by \a M[5] times 32 for the upper half. These bit positions are all
/// relative to the 128-bit lane for each set of eight operations.
56///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
74///
75/// \headerfile <immintrin.h>
76///
77/// \code
78/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
79/// \endcode
80///
81/// This intrinsic corresponds to the \c VMPSADBW instruction.
82///
83/// \param X
84/// A 256-bit integer vector containing one of the inputs.
85/// \param Y
86/// A 256-bit integer vector containing one of the inputs.
87/// \param M
88/// An unsigned immediate value specifying the starting positions of the
89/// bytes to operate on.
90/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256(                                         \
      (__v32qi)(__m256i)(X),                                                   \
      (__v32qi)(__m256i)(Y),                                                   \
      (int)(M)))
94
95/// Computes the absolute value of each signed byte in the 256-bit integer
96/// vector \a __a and returns each value in the corresponding byte of
97/// the result.
98///
99/// \headerfile <immintrin.h>
100///
101/// This intrinsic corresponds to the \c VPABSB instruction.
102///
103/// \param __a
104/// A 256-bit integer vector.
105/// \returns A 256-bit integer vector containing the result.
106static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
108 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
109}
110
111/// Computes the absolute value of each signed 16-bit element in the 256-bit
112/// vector of [16 x i16] in \a __a and returns each value in the
113/// corresponding element of the result.
114///
115/// \headerfile <immintrin.h>
116///
117/// This intrinsic corresponds to the \c VPABSW instruction.
118///
119/// \param __a
120/// A 256-bit vector of [16 x i16].
121/// \returns A 256-bit vector of [16 x i16] containing the result.
122static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
124 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
125}
126
127/// Computes the absolute value of each signed 32-bit element in the 256-bit
128/// vector of [8 x i32] in \a __a and returns each value in the
129/// corresponding element of the result.
130///
131/// \headerfile <immintrin.h>
132///
133/// This intrinsic corresponds to the \c VPABSD instruction.
134///
135/// \param __a
136/// A 256-bit vector of [8 x i32].
137/// \returns A 256-bit vector of [8 x i32] containing the result.
138static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
140 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
141}
142
143/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
144/// integers using signed saturation, and returns the 256-bit result.
145///
146/// \code{.operation}
147/// FOR i := 0 TO 7
148/// j := i*16
149/// k := i*8
150/// result[7+k:k] := SATURATE8(__a[15+j:j])
151/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
152/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
153/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
154/// ENDFOR
155/// \endcode
156///
157/// \headerfile <immintrin.h>
158///
159/// This intrinsic corresponds to the \c VPACKSSWB instruction.
160///
161/// \param __a
162/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
163/// result[191:128].
164/// \param __b
165/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
166/// result[255:192].
167/// \returns A 256-bit integer vector containing the result.
168static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
169_mm256_packs_epi16(__m256i __a, __m256i __b) {
170 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
171}
172
173/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
174/// integers using signed saturation, and returns the resulting 256-bit
175/// vector of [16 x i16].
176///
177/// \code{.operation}
178/// FOR i := 0 TO 3
179/// j := i*32
180/// k := i*16
181/// result[15+k:k] := SATURATE16(__a[31+j:j])
182/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
183/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
184/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
185/// ENDFOR
186/// \endcode
187///
188/// \headerfile <immintrin.h>
189///
190/// This intrinsic corresponds to the \c VPACKSSDW instruction.
191///
192/// \param __a
193/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
194/// result[191:128].
195/// \param __b
196/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
197/// result[255:192].
198/// \returns A 256-bit vector of [16 x i16] containing the result.
199static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
200_mm256_packs_epi32(__m256i __a, __m256i __b) {
201 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
202}
203
204/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
205/// using unsigned saturation, and returns the 256-bit result.
206///
207/// \code{.operation}
208/// FOR i := 0 TO 7
209/// j := i*16
210/// k := i*8
211/// result[7+k:k] := SATURATE8U(__a[15+j:j])
212/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
213/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
214/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
215/// ENDFOR
216/// \endcode
217///
218/// \headerfile <immintrin.h>
219///
220/// This intrinsic corresponds to the \c VPACKUSWB instruction.
221///
222/// \param __a
223/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
224/// result[191:128].
225/// \param __b
226/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
227/// result[255:192].
228/// \returns A 256-bit integer vector containing the result.
229static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
230_mm256_packus_epi16(__m256i __a, __m256i __b) {
231 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
232}
233
234/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
235/// using unsigned saturation, and returns the resulting 256-bit vector of
236/// [16 x i16].
237///
238/// \code{.operation}
239/// FOR i := 0 TO 3
240/// j := i*32
241/// k := i*16
242/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
243/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
244/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
245/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
246/// ENDFOR
247/// \endcode
248///
249/// \headerfile <immintrin.h>
250///
251/// This intrinsic corresponds to the \c VPACKUSDW instruction.
252///
253/// \param __V1
254/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
255/// result[191:128].
256/// \param __V2
257/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
258/// result[255:192].
259/// \returns A 256-bit vector of [16 x i16] containing the result.
260static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
261_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
262 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
263}
264
265/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
266/// vectors and returns the lower 8 bits of each sum in the corresponding
267/// byte of the 256-bit integer vector result (overflow is ignored).
268///
269/// \headerfile <immintrin.h>
270///
271/// This intrinsic corresponds to the \c VPADDB instruction.
272///
273/// \param __a
274/// A 256-bit integer vector containing one of the source operands.
275/// \param __b
276/// A 256-bit integer vector containing one of the source operands.
277/// \returns A 256-bit integer vector containing the sums.
278static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
279_mm256_add_epi8(__m256i __a, __m256i __b) {
280 return (__m256i)((__v32qu)__a + (__v32qu)__b);
281}
282
283/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284/// [16 x i16] and returns the lower 16 bits of each sum in the
285/// corresponding element of the [16 x i16] result (overflow is ignored).
286///
287/// \headerfile <immintrin.h>
288///
289/// This intrinsic corresponds to the \c VPADDW instruction.
290///
291/// \param __a
292/// A 256-bit vector of [16 x i16] containing one of the source operands.
293/// \param __b
294/// A 256-bit vector of [16 x i16] containing one of the source operands.
295/// \returns A 256-bit vector of [16 x i16] containing the sums.
296static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
297_mm256_add_epi16(__m256i __a, __m256i __b) {
298 return (__m256i)((__v16hu)__a + (__v16hu)__b);
299}
300
301/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
302/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
303/// element of the [8 x i32] result (overflow is ignored).
304///
305/// \headerfile <immintrin.h>
306///
307/// This intrinsic corresponds to the \c VPADDD instruction.
308///
309/// \param __a
310/// A 256-bit vector of [8 x i32] containing one of the source operands.
311/// \param __b
312/// A 256-bit vector of [8 x i32] containing one of the source operands.
313/// \returns A 256-bit vector of [8 x i32] containing the sums.
314static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
315_mm256_add_epi32(__m256i __a, __m256i __b) {
316 return (__m256i)((__v8su)__a + (__v8su)__b);
317}
318
319/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
320/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
321/// element of the [4 x i64] result (overflow is ignored).
322///
323/// \headerfile <immintrin.h>
324///
325/// This intrinsic corresponds to the \c VPADDQ instruction.
326///
327/// \param __a
328/// A 256-bit vector of [4 x i64] containing one of the source operands.
329/// \param __b
330/// A 256-bit vector of [4 x i64] containing one of the source operands.
331/// \returns A 256-bit vector of [4 x i64] containing the sums.
332static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
333_mm256_add_epi64(__m256i __a, __m256i __b) {
334 return (__m256i)((__v4du)__a + (__v4du)__b);
335}
336
337/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
338/// vectors using signed saturation, and returns each sum in the
339/// corresponding byte of the 256-bit integer vector result.
340///
341/// \headerfile <immintrin.h>
342///
343/// This intrinsic corresponds to the \c VPADDSB instruction.
344///
345/// \param __a
346/// A 256-bit integer vector containing one of the source operands.
347/// \param __b
348/// A 256-bit integer vector containing one of the source operands.
349/// \returns A 256-bit integer vector containing the sums.
350static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
351_mm256_adds_epi8(__m256i __a, __m256i __b) {
352 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
353}
354
355/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
356/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
357///
358/// \headerfile <immintrin.h>
359///
360/// This intrinsic corresponds to the \c VPADDSW instruction.
361///
362/// \param __a
363/// A 256-bit vector of [16 x i16] containing one of the source operands.
364/// \param __b
365/// A 256-bit vector of [16 x i16] containing one of the source operands.
366/// \returns A 256-bit vector of [16 x i16] containing the sums.
367static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
368_mm256_adds_epi16(__m256i __a, __m256i __b) {
369 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
370}
371
372/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
373/// vectors using unsigned saturation, and returns each sum in the
374/// corresponding byte of the 256-bit integer vector result.
375///
376/// \headerfile <immintrin.h>
377///
378/// This intrinsic corresponds to the \c VPADDUSB instruction.
379///
380/// \param __a
381/// A 256-bit integer vector containing one of the source operands.
382/// \param __b
383/// A 256-bit integer vector containing one of the source operands.
384/// \returns A 256-bit integer vector containing the sums.
385static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
386_mm256_adds_epu8(__m256i __a, __m256i __b) {
387 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
388}
389
390/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
391/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
392///
393/// \headerfile <immintrin.h>
394///
395/// This intrinsic corresponds to the \c VPADDUSW instruction.
396///
397/// \param __a
398/// A 256-bit vector of [16 x i16] containing one of the source operands.
399/// \param __b
400/// A 256-bit vector of [16 x i16] containing one of the source operands.
401/// \returns A 256-bit vector of [16 x i16] containing the sums.
402static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
403_mm256_adds_epu16(__m256i __a, __m256i __b) {
404 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
405}
406
407/// Uses the lower half of the 256-bit vector \a a as the upper half of a
408/// temporary 256-bit value, and the lower half of the 256-bit vector \a b
409/// as the lower half of the temporary value. Right-shifts the temporary
410/// value by \a n bytes, and uses the lower 16 bytes of the shifted value
411/// as the lower 16 bytes of the result. Uses the upper halves of \a a and
412/// \a b to make another temporary value, right shifts by \a n, and uses
413/// the lower 16 bytes of the shifted value as the upper 16 bytes of the
414/// result.
415///
416/// \headerfile <immintrin.h>
417///
418/// \code
419/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
420/// \endcode
421///
422/// This intrinsic corresponds to the \c VPALIGNR instruction.
423///
424/// \param a
425/// A 256-bit integer vector containing source values.
426/// \param b
427/// A 256-bit integer vector containing source values.
428/// \param n
429/// An immediate value specifying the number of bytes to shift.
430/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256(                                         \
      (__v32qi)(__m256i)(a),                                                   \
      (__v32qi)(__m256i)(b),                                                   \
      (n)))
434
435/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
436/// \a __b.
437///
438/// \headerfile <immintrin.h>
439///
440/// This intrinsic corresponds to the \c VPAND instruction.
441///
442/// \param __a
443/// A 256-bit integer vector.
444/// \param __b
445/// A 256-bit integer vector.
446/// \returns A 256-bit integer vector containing the result.
447static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
448_mm256_and_si256(__m256i __a, __m256i __b)
449{
450 return (__m256i)((__v4du)__a & (__v4du)__b);
451}
452
453/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
454/// the bitwise NOT of the 256-bit integer vector in \a __a.
455///
456/// \headerfile <immintrin.h>
457///
458/// This intrinsic corresponds to the \c VPANDN instruction.
459///
460/// \param __a
461/// A 256-bit integer vector.
462/// \param __b
463/// A 256-bit integer vector.
464/// \returns A 256-bit integer vector containing the result.
465static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
466_mm256_andnot_si256(__m256i __a, __m256i __b)
467{
468 return (__m256i)(~(__v4du)__a & (__v4du)__b);
469}
470
471/// Computes the averages of the corresponding unsigned bytes in the two
472/// 256-bit integer vectors in \a __a and \a __b and returns each
473/// average in the corresponding byte of the 256-bit result.
474///
475/// \code{.operation}
476/// FOR i := 0 TO 31
477/// j := i*8
478/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
479/// ENDFOR
480/// \endcode
481///
482/// \headerfile <immintrin.h>
483///
484/// This intrinsic corresponds to the \c VPAVGB instruction.
485///
486/// \param __a
487/// A 256-bit integer vector.
488/// \param __b
489/// A 256-bit integer vector.
490/// \returns A 256-bit integer vector containing the result.
491static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
492_mm256_avg_epu8(__m256i __a, __m256i __b) {
493 return (__m256i)__builtin_ia32_pavgb256((__v32qu)__a, (__v32qu)__b);
494}
495
496/// Computes the averages of the corresponding unsigned 16-bit integers in
497/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
498/// each average in the corresponding element of the 256-bit result.
499///
500/// \code{.operation}
501/// FOR i := 0 TO 15
502/// j := i*16
503/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
504/// ENDFOR
505/// \endcode
506///
507/// \headerfile <immintrin.h>
508///
509/// This intrinsic corresponds to the \c VPAVGW instruction.
510///
511/// \param __a
512/// A 256-bit vector of [16 x i16].
513/// \param __b
514/// A 256-bit vector of [16 x i16].
515/// \returns A 256-bit vector of [16 x i16] containing the result.
516static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
517_mm256_avg_epu16(__m256i __a, __m256i __b) {
518 return (__m256i)__builtin_ia32_pavgw256((__v16hu)__a, (__v16hu)__b);
519}
520
521/// Merges 8-bit integer values from either of the two 256-bit vectors
522/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
523/// the resulting 256-bit integer vector.
524///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   IF __M[7+j] == 0
///     result[7+j:j] := __V1[7+j:j]
///   ELSE
///     result[7+j:j] := __V2[7+j:j]
///   FI
/// ENDFOR
/// \endcode
535///
536/// \headerfile <immintrin.h>
537///
538/// This intrinsic corresponds to the \c VPBLENDVB instruction.
539///
540/// \param __V1
541/// A 256-bit integer vector containing source values.
542/// \param __V2
543/// A 256-bit integer vector containing source values.
544/// \param __M
545/// A 256-bit integer vector, with bit [7] of each byte specifying the
546/// source for each corresponding byte of the result. When the mask bit
547/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
548/// \a __V2.
549/// \returns A 256-bit integer vector containing the result.
550static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
551_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
552 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
553 (__v32qi)__M);
554}
555
556/// Merges 16-bit integer values from either of the two 256-bit vectors
557/// \a V1 or \a V2, as specified by the immediate integer operand \a M,
558/// and returns the resulting 256-bit vector of [16 x i16].
559///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
572///
573/// \headerfile <immintrin.h>
574///
575/// \code
576/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
577/// \endcode
578///
579/// This intrinsic corresponds to the \c VPBLENDW instruction.
580///
581/// \param V1
582/// A 256-bit vector of [16 x i16] containing source values.
583/// \param V2
584/// A 256-bit vector of [16 x i16] containing source values.
585/// \param M
586/// An immediate 8-bit integer operand, with bits [7:0] specifying the
587/// source for each element of the result. The position of the mask bit
588/// corresponds to the index of a copied value. When a mask bit is 0, the
589/// element is copied from \a V1; otherwise, it is copied from \a V2.
590/// \a M[0] determines the source for elements 0 and 8, \a M[1] for
591/// elements 1 and 9, and so forth.
592/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendw256(                                         \
      (__v16hi)(__m256i)(V1),                                                  \
      (__v16hi)(__m256i)(V2),                                                  \
      (int)(M)))
596
597/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
598/// \a __b for equality and returns the outcomes in the corresponding
599/// bytes of the 256-bit result.
600///
601/// \code{.operation}
602/// FOR i := 0 TO 31
603/// j := i*8
604/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
605/// ENDFOR
606/// \endcode
607///
608/// \headerfile <immintrin.h>
609///
610/// This intrinsic corresponds to the \c VPCMPEQB instruction.
611///
612/// \param __a
613/// A 256-bit integer vector containing one of the inputs.
614/// \param __b
615/// A 256-bit integer vector containing one of the inputs.
616/// \returns A 256-bit integer vector containing the result.
617static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
618_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
619{
620 return (__m256i)((__v32qi)__a == (__v32qi)__b);
621}
622
623/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
624/// \a __a and \a __b for equality and returns the outcomes in the
625/// corresponding elements of the 256-bit result.
626///
627/// \code{.operation}
628/// FOR i := 0 TO 15
629/// j := i*16
630/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
631/// ENDFOR
632/// \endcode
633///
634/// \headerfile <immintrin.h>
635///
636/// This intrinsic corresponds to the \c VPCMPEQW instruction.
637///
638/// \param __a
639/// A 256-bit vector of [16 x i16] containing one of the inputs.
640/// \param __b
641/// A 256-bit vector of [16 x i16] containing one of the inputs.
642/// \returns A 256-bit vector of [16 x i16] containing the result.
643static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
644_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
645{
646 return (__m256i)((__v16hi)__a == (__v16hi)__b);
647}
648
649/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
650/// \a __a and \a __b for equality and returns the outcomes in the
651/// corresponding elements of the 256-bit result.
652///
653/// \code{.operation}
654/// FOR i := 0 TO 7
655/// j := i*32
656/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
657/// ENDFOR
658/// \endcode
659///
660/// \headerfile <immintrin.h>
661///
662/// This intrinsic corresponds to the \c VPCMPEQD instruction.
663///
664/// \param __a
665/// A 256-bit vector of [8 x i32] containing one of the inputs.
666/// \param __b
667/// A 256-bit vector of [8 x i32] containing one of the inputs.
668/// \returns A 256-bit vector of [8 x i32] containing the result.
669static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
670_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
671{
672 return (__m256i)((__v8si)__a == (__v8si)__b);
673}
674
675/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
676/// \a __a and \a __b for equality and returns the outcomes in the
677/// corresponding elements of the 256-bit result.
678///
679/// \code{.operation}
680/// FOR i := 0 TO 3
681/// j := i*64
682/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
683/// ENDFOR
684/// \endcode
685///
686/// \headerfile <immintrin.h>
687///
688/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
689///
690/// \param __a
691/// A 256-bit vector of [4 x i64] containing one of the inputs.
692/// \param __b
693/// A 256-bit vector of [4 x i64] containing one of the inputs.
694/// \returns A 256-bit vector of [4 x i64] containing the result.
695static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
696_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
697{
698 return (__m256i)((__v4di)__a == (__v4di)__b);
699}
700
701/// Compares corresponding signed bytes in the 256-bit integer vectors in
702/// \a __a and \a __b for greater-than and returns the outcomes in the
703/// corresponding bytes of the 256-bit result.
704///
705/// \code{.operation}
706/// FOR i := 0 TO 31
707/// j := i*8
708/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
709/// ENDFOR
710/// \endcode
711///
712/// \headerfile <immintrin.h>
713///
714/// This intrinsic corresponds to the \c VPCMPGTB instruction.
715///
716/// \param __a
717/// A 256-bit integer vector containing one of the inputs.
718/// \param __b
719/// A 256-bit integer vector containing one of the inputs.
720/// \returns A 256-bit integer vector containing the result.
721static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
722_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
723{
724 /* This function always performs a signed comparison, but __v32qi is a char
725 which may be signed or unsigned, so use __v32qs. */
726 return (__m256i)((__v32qs)__a > (__v32qs)__b);
727}
728
729/// Compares corresponding signed elements in the 256-bit vectors of
730/// [16 x i16] in \a __a and \a __b for greater-than and returns the
731/// outcomes in the corresponding elements of the 256-bit result.
732///
733/// \code{.operation}
734/// FOR i := 0 TO 15
735/// j := i*16
736/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
737/// ENDFOR
738/// \endcode
739///
740/// \headerfile <immintrin.h>
741///
742/// This intrinsic corresponds to the \c VPCMPGTW instruction.
743///
744/// \param __a
745/// A 256-bit vector of [16 x i16] containing one of the inputs.
746/// \param __b
747/// A 256-bit vector of [16 x i16] containing one of the inputs.
748/// \returns A 256-bit vector of [16 x i16] containing the result.
749static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
750_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
751{
752 return (__m256i)((__v16hi)__a > (__v16hi)__b);
753}
754
755/// Compares corresponding signed elements in the 256-bit vectors of
756/// [8 x i32] in \a __a and \a __b for greater-than and returns the
757/// outcomes in the corresponding elements of the 256-bit result.
758///
759/// \code{.operation}
760/// FOR i := 0 TO 7
761/// j := i*32
762/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
763/// ENDFOR
764/// \endcode
765///
766/// \headerfile <immintrin.h>
767///
768/// This intrinsic corresponds to the \c VPCMPGTD instruction.
769///
770/// \param __a
771/// A 256-bit vector of [8 x i32] containing one of the inputs.
772/// \param __b
773/// A 256-bit vector of [8 x i32] containing one of the inputs.
774/// \returns A 256-bit vector of [8 x i32] containing the result.
775static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
776_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
777{
778 return (__m256i)((__v8si)__a > (__v8si)__b);
779}
780
781/// Compares corresponding signed elements in the 256-bit vectors of
782/// [4 x i64] in \a __a and \a __b for greater-than and returns the
783/// outcomes in the corresponding elements of the 256-bit result.
784///
785/// \code{.operation}
786/// FOR i := 0 TO 3
787/// j := i*64
788/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
789/// ENDFOR
790/// \endcode
791///
792/// \headerfile <immintrin.h>
793///
794/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
795///
796/// \param __a
797/// A 256-bit vector of [4 x i64] containing one of the inputs.
798/// \param __b
799/// A 256-bit vector of [4 x i64] containing one of the inputs.
800/// \returns A 256-bit vector of [4 x i64] containing the result.
801static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
802_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
803{
804 return (__m256i)((__v4di)__a > (__v4di)__b);
805}
806
807/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
808/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
809/// element of the [16 x i16] result (overflow is ignored). Sums from
810/// \a __a are returned in the lower 64 bits of each 128-bit half of the
811/// result; sums from \a __b are returned in the upper 64 bits of each
812/// 128-bit half of the result.
813///
814/// \code{.operation}
815/// FOR i := 0 TO 1
816/// j := i*128
817/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
818/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
819/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
820/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
821/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
822/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
823/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
824/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
825/// ENDFOR
826/// \endcode
827///
828/// \headerfile <immintrin.h>
829///
830/// This intrinsic corresponds to the \c VPHADDW instruction.
831///
832/// \param __a
833/// A 256-bit vector of [16 x i16] containing one of the source operands.
834/// \param __b
835/// A 256-bit vector of [16 x i16] containing one of the source operands.
836/// \returns A 256-bit vector of [16 x i16] containing the sums.
837static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
838_mm256_hadd_epi16(__m256i __a, __m256i __b) {
839 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
840}
841
842/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
843/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
844/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
845/// are returned in the lower 64 bits of each 128-bit half of the result;
846/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
847/// of the result.
848///
849/// \code{.operation}
850/// FOR i := 0 TO 1
851/// j := i*128
852/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
853/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
854/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
855/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
856/// ENDFOR
857/// \endcode
858///
859/// \headerfile <immintrin.h>
860///
861/// This intrinsic corresponds to the \c VPHADDD instruction.
862///
863/// \param __a
864/// A 256-bit vector of [8 x i32] containing one of the source operands.
865/// \param __b
866/// A 256-bit vector of [8 x i32] containing one of the source operands.
867/// \returns A 256-bit vector of [8 x i32] containing the sums.
868static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
869_mm256_hadd_epi32(__m256i __a, __m256i __b) {
870 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
871}
872
873/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
874/// vectors of [16 x i16] using signed saturation and returns each sum in
875/// an element of the [16 x i16] result. Sums from \a __a are returned in
876/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
877/// are returned in the upper 64 bits of each 128-bit half of the result.
878///
879/// \code{.operation}
880/// FOR i := 0 TO 1
881/// j := i*128
882/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
883/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
884/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
885/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
886/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
887/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
888/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
889/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
890/// ENDFOR
891/// \endcode
892///
893/// \headerfile <immintrin.h>
894///
895/// This intrinsic corresponds to the \c VPHADDSW instruction.
896///
897/// \param __a
898/// A 256-bit vector of [16 x i16] containing one of the source operands.
899/// \param __b
900/// A 256-bit vector of [16 x i16] containing one of the source operands.
901/// \returns A 256-bit vector of [16 x i16] containing the sums.
902static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
903_mm256_hadds_epi16(__m256i __a, __m256i __b) {
904 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
905}
906
907/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
908/// vectors of [16 x i16] and returns the lower 16 bits of each difference
909/// in an element of the [16 x i16] result (overflow is ignored).
910/// Differences from \a __a are returned in the lower 64 bits of each
911/// 128-bit half of the result; differences from \a __b are returned in the
912/// upper 64 bits of each 128-bit half of the result.
913///
914/// \code{.operation}
915/// FOR i := 0 TO 1
916/// j := i*128
917/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
918/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
919/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
920/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
921/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
922/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
923/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
924/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
925/// ENDFOR
926/// \endcode
927///
928/// \headerfile <immintrin.h>
929///
930/// This intrinsic corresponds to the \c VPHSUBW instruction.
931///
932/// \param __a
933/// A 256-bit vector of [16 x i16] containing one of the source operands.
934/// \param __b
935/// A 256-bit vector of [16 x i16] containing one of the source operands.
936/// \returns A 256-bit vector of [16 x i16] containing the differences.
937static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
938_mm256_hsub_epi16(__m256i __a, __m256i __b) {
939 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
940}
941
942/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
943/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
944/// an element of the [8 x i32] result (overflow is ignored). Differences
945/// from \a __a are returned in the lower 64 bits of each 128-bit half of
946/// the result; differences from \a __b are returned in the upper 64 bits
947/// of each 128-bit half of the result.
948///
949/// \code{.operation}
950/// FOR i := 0 TO 1
951/// j := i*128
952/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
953/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
954/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
955/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
956/// ENDFOR
957/// \endcode
958///
959/// \headerfile <immintrin.h>
960///
961/// This intrinsic corresponds to the \c VPHSUBD instruction.
962///
963/// \param __a
964/// A 256-bit vector of [8 x i32] containing one of the source operands.
965/// \param __b
966/// A 256-bit vector of [8 x i32] containing one of the source operands.
967/// \returns A 256-bit vector of [8 x i32] containing the differences.
968static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
969_mm256_hsub_epi32(__m256i __a, __m256i __b) {
970 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
971}
972
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
/// vectors of [16 x i16] using signed saturation and returns each
/// difference in an element of the [16 x i16] result. Differences from
/// \a __a are returned in the lower 64 bits of each 128-bit half of the
/// result; differences from \a __b are returned in the upper 64 bits of
/// each 128-bit half of the result.
979///
980/// \code{.operation}
981/// FOR i := 0 TO 1
982/// j := i*128
983/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
984/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
985/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
986/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
987/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
988/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
989/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
990/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
991/// ENDFOR
992/// \endcode
993///
994/// \headerfile <immintrin.h>
995///
996/// This intrinsic corresponds to the \c VPHSUBSW instruction.
997///
998/// \param __a
999/// A 256-bit vector of [16 x i16] containing one of the source operands.
1000/// \param __b
1001/// A 256-bit vector of [16 x i16] containing one of the source operands.
1002/// \returns A 256-bit vector of [16 x i16] containing the differences.
1003static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1004_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
1005 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1006}
1007
1008/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1009/// with the corresponding signed byte from the 256-bit integer vector in
1010/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1011/// pairs of those products using signed saturation to form 16-bit sums
1012/// returned as elements of the [16 x i16] result.
1013///
1014/// \code{.operation}
1015/// FOR i := 0 TO 15
1016/// j := i*16
1017/// temp1 := __a[j+7:j] * __b[j+7:j]
1018/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1019/// result[j+15:j] := SATURATE16(temp1 + temp2)
1020/// ENDFOR
1021/// \endcode
1022///
1023/// \headerfile <immintrin.h>
1024///
1025/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1026///
1027/// \param __a
1028/// A 256-bit vector containing one of the source operands.
1029/// \param __b
1030/// A 256-bit vector containing one of the source operands.
1031/// \returns A 256-bit vector of [16 x i16] containing the result.
1032static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1033_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
1034 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1035}
1036
1037/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1038/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1039/// those products to form 32-bit sums returned as elements of the
1040/// [8 x i32] result.
1041///
1042/// There is only one wraparound case: when all four of the 16-bit sources
1043/// are \c 0x8000, the result will be \c 0x80000000.
1044///
1045/// \code{.operation}
1046/// FOR i := 0 TO 7
1047/// j := i*32
1048/// temp1 := __a[j+15:j] * __b[j+15:j]
1049/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1050/// result[j+31:j] := temp1 + temp2
1051/// ENDFOR
1052/// \endcode
1053///
1054/// \headerfile <immintrin.h>
1055///
1056/// This intrinsic corresponds to the \c VPMADDWD instruction.
1057///
1058/// \param __a
1059/// A 256-bit vector of [16 x i16] containing one of the source operands.
1060/// \param __b
1061/// A 256-bit vector of [16 x i16] containing one of the source operands.
1062/// \returns A 256-bit vector of [8 x i32] containing the result.
1063static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1064_mm256_madd_epi16(__m256i __a, __m256i __b) {
1065 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1066}
1067
1068/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1069/// in \a __a and \a __b and returns the larger of each pair in the
1070/// corresponding byte of the 256-bit result.
1071///
1072/// \headerfile <immintrin.h>
1073///
1074/// This intrinsic corresponds to the \c VPMAXSB instruction.
1075///
1076/// \param __a
1077/// A 256-bit integer vector.
1078/// \param __b
1079/// A 256-bit integer vector.
1080/// \returns A 256-bit integer vector containing the result.
1081static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1082_mm256_max_epi8(__m256i __a, __m256i __b) {
1083 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1084}
1085
1086/// Compares the corresponding signed 16-bit integers in the two 256-bit
1087/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1088/// each pair in the corresponding element of the 256-bit result.
1089///
1090/// \headerfile <immintrin.h>
1091///
1092/// This intrinsic corresponds to the \c VPMAXSW instruction.
1093///
1094/// \param __a
1095/// A 256-bit vector of [16 x i16].
1096/// \param __b
1097/// A 256-bit vector of [16 x i16].
1098/// \returns A 256-bit vector of [16 x i16] containing the result.
1099static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1100_mm256_max_epi16(__m256i __a, __m256i __b) {
1101 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1102}
1103
1104/// Compares the corresponding signed 32-bit integers in the two 256-bit
1105/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1106/// each pair in the corresponding element of the 256-bit result.
1107///
1108/// \headerfile <immintrin.h>
1109///
1110/// This intrinsic corresponds to the \c VPMAXSD instruction.
1111///
1112/// \param __a
1113/// A 256-bit vector of [8 x i32].
1114/// \param __b
1115/// A 256-bit vector of [8 x i32].
1116/// \returns A 256-bit vector of [8 x i32] containing the result.
1117static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1118_mm256_max_epi32(__m256i __a, __m256i __b) {
1119 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1120}
1121
1122/// Compares the corresponding unsigned bytes in the two 256-bit integer
1123/// vectors in \a __a and \a __b and returns the larger of each pair in
1124/// the corresponding byte of the 256-bit result.
1125///
1126/// \headerfile <immintrin.h>
1127///
1128/// This intrinsic corresponds to the \c VPMAXUB instruction.
1129///
1130/// \param __a
1131/// A 256-bit integer vector.
1132/// \param __b
1133/// A 256-bit integer vector.
1134/// \returns A 256-bit integer vector containing the result.
1135static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1136_mm256_max_epu8(__m256i __a, __m256i __b) {
1137 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1138}
1139
1140/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1141/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1142/// each pair in the corresponding element of the 256-bit result.
1143///
1144/// \headerfile <immintrin.h>
1145///
1146/// This intrinsic corresponds to the \c VPMAXUW instruction.
1147///
1148/// \param __a
1149/// A 256-bit vector of [16 x i16].
1150/// \param __b
1151/// A 256-bit vector of [16 x i16].
1152/// \returns A 256-bit vector of [16 x i16] containing the result.
1153static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1154_mm256_max_epu16(__m256i __a, __m256i __b) {
1155 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1156}
1157
1158/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1159/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1160/// each pair in the corresponding element of the 256-bit result.
1161///
1162/// \headerfile <immintrin.h>
1163///
1164/// This intrinsic corresponds to the \c VPMAXUD instruction.
1165///
1166/// \param __a
1167/// A 256-bit vector of [8 x i32].
1168/// \param __b
1169/// A 256-bit vector of [8 x i32].
1170/// \returns A 256-bit vector of [8 x i32] containing the result.
1171static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1172_mm256_max_epu32(__m256i __a, __m256i __b) {
1173 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1174}
1175
1176/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1177/// in \a __a and \a __b and returns the smaller of each pair in the
1178/// corresponding byte of the 256-bit result.
1179///
1180/// \headerfile <immintrin.h>
1181///
1182/// This intrinsic corresponds to the \c VPMINSB instruction.
1183///
1184/// \param __a
1185/// A 256-bit integer vector.
1186/// \param __b
1187/// A 256-bit integer vector.
1188/// \returns A 256-bit integer vector containing the result.
1189static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1190_mm256_min_epi8(__m256i __a, __m256i __b) {
1191 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1192}
1193
1194/// Compares the corresponding signed 16-bit integers in the two 256-bit
1195/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1196/// each pair in the corresponding element of the 256-bit result.
1197///
1198/// \headerfile <immintrin.h>
1199///
1200/// This intrinsic corresponds to the \c VPMINSW instruction.
1201///
1202/// \param __a
1203/// A 256-bit vector of [16 x i16].
1204/// \param __b
1205/// A 256-bit vector of [16 x i16].
1206/// \returns A 256-bit vector of [16 x i16] containing the result.
1207static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1208_mm256_min_epi16(__m256i __a, __m256i __b) {
1209 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1210}
1211
1212/// Compares the corresponding signed 32-bit integers in the two 256-bit
1213/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1214/// each pair in the corresponding element of the 256-bit result.
1215///
1216/// \headerfile <immintrin.h>
1217///
1218/// This intrinsic corresponds to the \c VPMINSD instruction.
1219///
1220/// \param __a
1221/// A 256-bit vector of [8 x i32].
1222/// \param __b
1223/// A 256-bit vector of [8 x i32].
1224/// \returns A 256-bit vector of [8 x i32] containing the result.
1225static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1226_mm256_min_epi32(__m256i __a, __m256i __b) {
1227 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1228}
1229
1230/// Compares the corresponding unsigned bytes in the two 256-bit integer
1231/// vectors in \a __a and \a __b and returns the smaller of each pair in
1232/// the corresponding byte of the 256-bit result.
1233///
1234/// \headerfile <immintrin.h>
1235///
1236/// This intrinsic corresponds to the \c VPMINUB instruction.
1237///
1238/// \param __a
1239/// A 256-bit integer vector.
1240/// \param __b
1241/// A 256-bit integer vector.
1242/// \returns A 256-bit integer vector containing the result.
1243static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1244_mm256_min_epu8(__m256i __a, __m256i __b) {
1245 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1246}
1247
1248/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1249/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1250/// each pair in the corresponding element of the 256-bit result.
1251///
1252/// \headerfile <immintrin.h>
1253///
1254/// This intrinsic corresponds to the \c VPMINUW instruction.
1255///
1256/// \param __a
1257/// A 256-bit vector of [16 x i16].
1258/// \param __b
1259/// A 256-bit vector of [16 x i16].
1260/// \returns A 256-bit vector of [16 x i16] containing the result.
1261static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1262_mm256_min_epu16(__m256i __a, __m256i __b) {
1263 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1264}
1265
1266/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1267/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1268/// each pair in the corresponding element of the 256-bit result.
1269///
1270/// \headerfile <immintrin.h>
1271///
1272/// This intrinsic corresponds to the \c VPMINUD instruction.
1273///
1274/// \param __a
1275/// A 256-bit vector of [8 x i32].
1276/// \param __b
1277/// A 256-bit vector of [8 x i32].
1278/// \returns A 256-bit vector of [8 x i32] containing the result.
1279static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1280_mm256_min_epu32(__m256i __a, __m256i __b) {
1281 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1282}
1283
1284/// Creates a 32-bit integer mask from the most significant bit of each byte
1285/// in the 256-bit integer vector in \a __a and returns the result.
1286///
1287/// \code{.operation}
1288/// FOR i := 0 TO 31
1289/// j := i*8
1290/// result[i] := __a[j+7]
1291/// ENDFOR
1292/// \endcode
1293///
1294/// \headerfile <immintrin.h>
1295///
1296/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1297///
1298/// \param __a
1299/// A 256-bit integer vector containing the source bytes.
1300/// \returns The 32-bit integer mask.
1301static __inline__ int __DEFAULT_FN_ATTRS256
1303{
1304 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1305}
1306
1307/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1308/// the 16-bit values in the corresponding elements of a 256-bit vector
1309/// of [16 x i16].
1310///
1311/// \code{.operation}
1312/// FOR i := 0 TO 15
1313/// j := i*8
1314/// k := i*16
1315/// result[k+15:k] := SignExtend(__V[j+7:j])
1316/// ENDFOR
1317/// \endcode
1318///
1319/// \headerfile <immintrin.h>
1320///
1321/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1322///
1323/// \param __V
1324/// A 128-bit integer vector containing the source bytes.
1325/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1326/// values.
1327static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1329 /* This function always performs a signed extension, but __v16qi is a char
1330 which may be signed or unsigned, so use __v16qs. */
1331 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1332}
1333
1334/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1335/// \a __V and returns the 32-bit values in the corresponding elements of a
1336/// 256-bit vector of [8 x i32].
1337///
1338/// \code{.operation}
1339/// FOR i := 0 TO 7
1340/// j := i*8
1341/// k := i*32
1342/// result[k+31:k] := SignExtend(__V[j+7:j])
1343/// ENDFOR
1344/// \endcode
1345///
1346/// \headerfile <immintrin.h>
1347///
1348/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1349///
1350/// \param __V
1351/// A 128-bit integer vector containing the source bytes.
1352/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1353/// values.
1354static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1356 /* This function always performs a signed extension, but __v16qi is a char
1357 which may be signed or unsigned, so use __v16qs. */
1358 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1359}
1360
1361/// Sign-extends the first four bytes from the 128-bit integer vector in
1362/// \a __V and returns the 64-bit values in the corresponding elements of a
1363/// 256-bit vector of [4 x i64].
1364///
1365/// \code{.operation}
1366/// result[63:0] := SignExtend(__V[7:0])
1367/// result[127:64] := SignExtend(__V[15:8])
1368/// result[191:128] := SignExtend(__V[23:16])
1369/// result[255:192] := SignExtend(__V[31:24])
1370/// \endcode
1371///
1372/// \headerfile <immintrin.h>
1373///
1374/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1375///
1376/// \param __V
1377/// A 128-bit integer vector containing the source bytes.
1378/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1379/// values.
1380static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1382 /* This function always performs a signed extension, but __v16qi is a char
1383 which may be signed or unsigned, so use __v16qs. */
1384 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1385}
1386
1387/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1388/// \a __V and returns the 32-bit values in the corresponding elements of a
1389/// 256-bit vector of [8 x i32].
1390///
1391/// \code{.operation}
1392/// FOR i := 0 TO 7
1393/// j := i*16
1394/// k := i*32
1395/// result[k+31:k] := SignExtend(__V[j+15:j])
1396/// ENDFOR
1397/// \endcode
1398///
1399/// \headerfile <immintrin.h>
1400///
1401/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1402///
1403/// \param __V
1404/// A 128-bit vector of [8 x i16] containing the source values.
1405/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1406/// values.
1407static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1409 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1410}
1411
1412/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1413/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1414/// elements of a 256-bit vector of [4 x i64].
1415///
1416/// \code{.operation}
1417/// result[63:0] := SignExtend(__V[15:0])
1418/// result[127:64] := SignExtend(__V[31:16])
1419/// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
1421/// \endcode
1422///
1423/// \headerfile <immintrin.h>
1424///
1425/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1426///
1427/// \param __V
1428/// A 128-bit vector of [8 x i16] containing the source values.
1429/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1430/// values.
1431static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1433 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1434}
1435
1436/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1437/// \a __V and returns the 64-bit values in the corresponding elements of a
1438/// 256-bit vector of [4 x i64].
1439///
1440/// \code{.operation}
1441/// result[63:0] := SignExtend(__V[31:0])
1442/// result[127:64] := SignExtend(__V[63:32])
1443/// result[191:128] := SignExtend(__V[95:64])
1444/// result[255:192] := SignExtend(__V[127:96])
1445/// \endcode
1446///
1447/// \headerfile <immintrin.h>
1448///
1449/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1450///
1451/// \param __V
1452/// A 128-bit vector of [4 x i32] containing the source values.
1453/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1454/// values.
1455static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1457 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1458}
1459
1460/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1461/// the 16-bit values in the corresponding elements of a 256-bit vector
1462/// of [16 x i16].
1463///
1464/// \code{.operation}
1465/// FOR i := 0 TO 15
1466/// j := i*8
1467/// k := i*16
1468/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1469/// ENDFOR
1470/// \endcode
1471///
1472/// \headerfile <immintrin.h>
1473///
1474/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1475///
1476/// \param __V
1477/// A 128-bit integer vector containing the source bytes.
1478/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1479/// values.
1480static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1482 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1483}
1484
1485/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1486/// \a __V and returns the 32-bit values in the corresponding elements of a
1487/// 256-bit vector of [8 x i32].
1488///
1489/// \code{.operation}
1490/// FOR i := 0 TO 7
1491/// j := i*8
1492/// k := i*32
1493/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1494/// ENDFOR
1495/// \endcode
1496///
1497/// \headerfile <immintrin.h>
1498///
1499/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1500///
1501/// \param __V
1502/// A 128-bit integer vector containing the source bytes.
1503/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1504/// values.
1505static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1507 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1508}
1509
1510/// Zero-extends the first four bytes from the 128-bit integer vector in
1511/// \a __V and returns the 64-bit values in the corresponding elements of a
1512/// 256-bit vector of [4 x i64].
1513///
1514/// \code{.operation}
1515/// result[63:0] := ZeroExtend(__V[7:0])
1516/// result[127:64] := ZeroExtend(__V[15:8])
1517/// result[191:128] := ZeroExtend(__V[23:16])
1518/// result[255:192] := ZeroExtend(__V[31:24])
1519/// \endcode
1520///
1521/// \headerfile <immintrin.h>
1522///
1523/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1524///
1525/// \param __V
1526/// A 128-bit integer vector containing the source bytes.
1527/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1528/// values.
1529static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1531 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1532}
1533
1534/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1535/// \a __V and returns the 32-bit values in the corresponding elements of a
1536/// 256-bit vector of [8 x i32].
1537///
1538/// \code{.operation}
1539/// FOR i := 0 TO 7
1540/// j := i*16
1541/// k := i*32
1542/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1543/// ENDFOR
1544/// \endcode
1545///
1546/// \headerfile <immintrin.h>
1547///
1548/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1549///
1550/// \param __V
1551/// A 128-bit vector of [8 x i16] containing the source values.
1552/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1553/// values.
1554static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1556 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1557}
1558
1559/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1560/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1561/// elements of a 256-bit vector of [4 x i64].
1562///
1563/// \code{.operation}
1564/// result[63:0] := ZeroExtend(__V[15:0])
1565/// result[127:64] := ZeroExtend(__V[31:16])
1566/// result[191:128] := ZeroExtend(__V[47:32])
/// result[255:192] := ZeroExtend(__V[63:48])
1568/// \endcode
1569///
1570/// \headerfile <immintrin.h>
1571///
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1573///
1574/// \param __V
1575/// A 128-bit vector of [8 x i16] containing the source values.
1576/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1577/// values.
1578static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1580 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1581}
1582
1583/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1584/// \a __V and returns the 64-bit values in the corresponding elements of a
1585/// 256-bit vector of [4 x i64].
1586///
1587/// \code{.operation}
1588/// result[63:0] := ZeroExtend(__V[31:0])
1589/// result[127:64] := ZeroExtend(__V[63:32])
1590/// result[191:128] := ZeroExtend(__V[95:64])
1591/// result[255:192] := ZeroExtend(__V[127:96])
1592/// \endcode
1593///
1594/// \headerfile <immintrin.h>
1595///
1596/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1597///
1598/// \param __V
1599/// A 128-bit vector of [4 x i32] containing the source values.
1600/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1601/// values.
1602static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1604 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1605}
1606
1607/// Multiplies signed 32-bit integers from even-numbered elements of two
1608/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1609/// [4 x i64] result.
1610///
1611/// \code{.operation}
1612/// result[63:0] := __a[31:0] * __b[31:0]
1613/// result[127:64] := __a[95:64] * __b[95:64]
1614/// result[191:128] := __a[159:128] * __b[159:128]
1615/// result[255:192] := __a[223:192] * __b[223:192]
1616/// \endcode
1617///
1618/// \headerfile <immintrin.h>
1619///
1620/// This intrinsic corresponds to the \c VPMULDQ instruction.
1621///
1622/// \param __a
1623/// A 256-bit vector of [8 x i32] containing one of the source operands.
1624/// \param __b
1625/// A 256-bit vector of [8 x i32] containing one of the source operands.
1626/// \returns A 256-bit vector of [4 x i64] containing the products.
1627static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1628_mm256_mul_epi32(__m256i __a, __m256i __b) {
1629 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1630}
1631
1632/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1633/// [16 x i16], truncates the 32-bit results to the most significant 18
1634/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1635/// product in the [16 x i16] result.
1636///
1637/// \code{.operation}
1638/// FOR i := 0 TO 15
1639/// j := i*16
1640/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
/// result[j+15:j] := temp[16:1]
/// ENDFOR
/// \endcode
1643///
1644/// \headerfile <immintrin.h>
1645///
1646/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1647///
1648/// \param __a
1649/// A 256-bit vector of [16 x i16] containing one of the source operands.
1650/// \param __b
1651/// A 256-bit vector of [16 x i16] containing one of the source operands.
1652/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1653static __inline__ __m256i __DEFAULT_FN_ATTRS256
1654_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1655{
  /* The scale-and-round step is not coded here; this wrapper only supplies
     __v16hi views of the operands and forwards them to the builtin. */
1656 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1657}
1658
1659/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1660/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1661/// [16 x i16] result.
1662///
1663/// \headerfile <immintrin.h>
1664///
1665/// This intrinsic corresponds to the \c VPMULHUW instruction.
1666///
1667/// \param __a
1668/// A 256-bit vector of [16 x i16] containing one of the source operands.
1669/// \param __b
1670/// A 256-bit vector of [16 x i16] containing one of the source operands.
1671/// \returns A 256-bit vector of [16 x i16] containing the products.
1672static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1673_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1674{
  /* Uses __v16hu views and the pmulhuw builtin; the signed counterpart
     _mm256_mulhi_epi16 uses __v16hi views and pmulhw instead. */
1675 return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
1676}
1677
1678/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1679/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1680/// [16 x i16] result.
1681///
1682/// \headerfile <immintrin.h>
1683///
1684/// This intrinsic corresponds to the \c VPMULHW instruction.
1685///
1686/// \param __a
1687/// A 256-bit vector of [16 x i16] containing one of the source operands.
1688/// \param __b
1689/// A 256-bit vector of [16 x i16] containing one of the source operands.
1690/// \returns A 256-bit vector of [16 x i16] containing the products.
1691static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1692_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1693{
  /* Uses __v16hi views and the pmulhw builtin; the unsigned counterpart
     _mm256_mulhi_epu16 uses __v16hu views and pmulhuw instead. */
1694 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1695}
1696
1697/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1698/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1699/// [16 x i16] result.
1700///
1701/// \headerfile <immintrin.h>
1702///
1703/// This intrinsic corresponds to the \c VPMULLW instruction.
1704///
1705/// \param __a
1706/// A 256-bit vector of [16 x i16] containing one of the source operands.
1707/// \param __b
1708/// A 256-bit vector of [16 x i16] containing one of the source operands.
1709/// \returns A 256-bit vector of [16 x i16] containing the products.
1710static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1711_mm256_mullo_epi16(__m256i __a, __m256i __b)
1712{
  /* No builtin here: the product is the vector `*` operator applied to
     __v16hu views of the operands.
     NOTE(review): presumably the unsigned element type is chosen so that
     overflow wraps rather than being undefined; the low 16 bits of each
     product do not depend on signedness -- confirm. */
1713 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1714}
1715
1716/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1717/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1718/// [8 x i32] result.
1719///
1720/// \headerfile <immintrin.h>
1721///
1722/// This intrinsic corresponds to the \c VPMULLD instruction.
1723///
1724/// \param __a
1725/// A 256-bit vector of [8 x i32] containing one of the source operands.
1726/// \param __b
1727/// A 256-bit vector of [8 x i32] containing one of the source operands.
1728/// \returns A 256-bit vector of [8 x i32] containing the products.
1729static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1730_mm256_mullo_epi32(__m256i __a, __m256i __b) {
  /* No builtin here: the product is the vector `*` operator applied to
     __v8su views of the operands.
     NOTE(review): presumably the unsigned element type is chosen so that
     overflow wraps rather than being undefined; the low 32 bits of each
     product do not depend on signedness -- confirm. */
1731 return (__m256i)((__v8su)__a * (__v8su)__b);
1732}
1733
1734/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1735/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1736/// [4 x i64] result.
1737///
1738/// \code{.operation}
1739/// result[63:0] := __a[31:0] * __b[31:0]
1740/// result[127:64] := __a[95:64] * __b[95:64]
1741/// result[191:128] := __a[159:128] * __b[159:128]
1742/// result[255:192] := __a[223:192] * __b[223:192]
1743/// \endcode
1744///
1745/// \headerfile <immintrin.h>
1746///
1747/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1748///
1749/// \param __a
1750/// A 256-bit vector of [8 x i32] containing one of the source operands.
1751/// \param __b
1752/// A 256-bit vector of [8 x i32] containing one of the source operands.
1753/// \returns A 256-bit vector of [4 x i64] containing the products.
1754static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1755_mm256_mul_epu32(__m256i __a, __m256i __b) {
  /* The operands are passed as __v8si views even though the multiply is
     documented as unsigned; nothing in this wrapper itself treats the lanes
     as unsigned, so that behavior has to come from the pmuludq builtin.
     The result is cast to __m256i explicitly, as sibling wrappers such as
     _mm256_mul_epi32 do, instead of relying on an implicit conversion. */
1756 return (__m256i)__builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1757}
1758
1759/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1760/// \a __b.
1761///
1762/// \headerfile <immintrin.h>
1763///
1764/// This intrinsic corresponds to the \c VPOR instruction.
1765///
1766/// \param __a
1767/// A 256-bit integer vector.
1768/// \param __b
1769/// A 256-bit integer vector.
1770/// \returns A 256-bit integer vector containing the result.
1771static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1772_mm256_or_si256(__m256i __a, __m256i __b)
1773{
  /* No builtin here: the result is the vector `|` operator applied to
     __v4du views of the operands, reinterpreted as __m256i. */
1774 return (__m256i)((__v4du)__a | (__v4du)__b);
1775}
1776
1777/// Computes four sum of absolute difference (SAD) operations on sets of eight
1778/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1779/// \a __b.
1780///
1781/// One SAD result is computed for each set of eight bytes from \a __a and
1782/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1783/// corresponding 64-bit element of the result.
1784///
1785/// A single SAD operation takes the differences between the corresponding
1786/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1787/// and sums these eight values to form one 16-bit result. This operation
1788/// is repeated four times with successive sets of eight bytes.
1789///
1790/// \code{.operation}
1791/// FOR i := 0 TO 3
1792/// j := i*64
1793/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1794/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1795/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1796/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1797/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1798/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1799/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1800/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1801/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1802/// temp4 + temp5 + temp6 + temp7
1803/// result[j+63:j+16] := 0
1804/// ENDFOR
1805/// \endcode
1806///
1807/// \headerfile <immintrin.h>
1808///
1809/// This intrinsic corresponds to the \c VPSADBW instruction.
1810///
1811/// \param __a
1812/// A 256-bit integer vector.
1813/// \param __b
1814/// A 256-bit integer vector.
1815/// \returns A 256-bit integer vector containing the result.
1816static __inline__ __m256i __DEFAULT_FN_ATTRS256
1817_mm256_sad_epu8(__m256i __a, __m256i __b)
1818{
  /* Both operands are passed to the builtin as __v32qi views. The result is
     cast to __m256i explicitly, as sibling wrappers such as
     _mm256_shuffle_epi8 do, instead of relying on an implicit conversion. */
1819 return (__m256i)__builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1820}
1821
1822/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1823/// to control information in the 256-bit integer vector \a __b, and
1824/// returns the 256-bit result. In effect there are two separate 128-bit
1825/// shuffles in the lower and upper halves.
1826///
1827/// \code{.operation}
1828/// FOR i := 0 TO 31
1829/// j := i*8
1830/// IF __b[j+7] == 1
1831/// result[j+7:j] := 0
1832/// ELSE
1833/// k := __b[j+3:j] * 8
1834/// IF i > 15
1835/// k := k + 128
1836/// FI
1837/// result[j+7:j] := __a[k+7:k]
1838/// FI
1839/// ENDFOR
1840/// \endcode
1841///
1842/// \headerfile <immintrin.h>
1843///
1844/// This intrinsic corresponds to the \c VPSHUFB instruction.
1845///
1846/// \param __a
1847/// A 256-bit integer vector containing source values.
1848/// \param __b
1849/// A 256-bit integer vector containing control information to determine
1850/// what goes into the corresponding byte of the result. If bit 7 of the
1851/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1852/// control byte specify the index (within the same 128-bit half) of \a __a
1853/// to copy to the result byte.
1854/// \returns A 256-bit integer vector containing the result.
1855static __inline__ __m256i __DEFAULT_FN_ATTRS256
1856_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1857{
  /* The data vector (__a) and the control vector (__b) are both passed to
     the builtin as __v32qi views; the per-byte selection is not coded in
     this wrapper. */
1858 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1859}
1860
1861/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1862/// according to control information in the integer literal \a imm, and
1863/// returns the 256-bit result. In effect there are two parallel 128-bit
1864/// shuffles in the lower and upper halves.
1865///
1866/// \code{.operation}
1867/// FOR i := 0 TO 3
1868/// j := i*32
1869/// k := (imm >> i*2)[1:0] * 32
1870/// result[j+31:j] := a[k+31:k]
1871/// result[128+j+31:128+j] := a[128+k+31:128+k]
1872/// ENDFOR
1873/// \endcode
1874///
1875/// \headerfile <immintrin.h>
1876///
1877/// \code
1878/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1879/// \endcode
1880///
1881/// This intrinsic corresponds to the \c VPSHUFD instruction.
1882///
1883/// \param a
1884/// A 256-bit vector of [8 x i32] containing source values.
1885/// \param imm
1886/// An immediate 8-bit value specifying which elements to copy from \a a.
1887/// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1888/// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1889/// forth.
1890/// \returns A 256-bit vector of [8 x i32] containing the result.
/* Implemented as a macro rather than a function. Each argument is
   parenthesized, cast, and expanded exactly once: `a` as __m256i then
   __v8si, `imm` as int.
   NOTE(review): presumably a macro so that `imm` reaches the builtin as a
   compile-time constant -- confirm. */
1891#define _mm256_shuffle_epi32(a, imm) \
1892 ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1893
1894/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1895/// according to control information in the integer literal \a imm, and
1896/// returns the 256-bit result. The upper 64 bits of each 128-bit half
1897/// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1898/// copied from \a a unchanged.
1899///
1900/// \code{.operation}
1901/// result[63:0] := a[63:0]
1902/// result[191:128] := a[191:128]
1903/// FOR i := 0 TO 3
1904/// j := i * 16 + 64
1905/// k := (imm >> i*2)[1:0] * 16 + 64
1906/// result[j+15:j] := a[k+15:k]
1907/// result[128+j+15:128+j] := a[128+k+15:128+k]
1908/// ENDFOR
1909/// \endcode
1910///
1911/// \headerfile <immintrin.h>
1912///
1913/// \code
1914/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1915/// \endcode
1916///
1917/// This intrinsic corresponds to the \c VPSHUFHW instruction.
1918///
1919/// \param a
1920/// A 256-bit vector of [16 x i16] containing source values.
1921/// \param imm
1922/// An immediate 8-bit value specifying which elements to copy from \a a.
1923/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1924/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1925/// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1926/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Implemented as a macro rather than a function. Each argument is
   parenthesized, cast, and expanded exactly once: `a` as __m256i then
   __v16hi, `imm` as int.
   NOTE(review): presumably a macro so that `imm` reaches the builtin as a
   compile-time constant -- confirm. */
1927#define _mm256_shufflehi_epi16(a, imm) \
1928 ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1929
1930/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1931/// according to control information in the integer literal \a imm, and
1932/// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1933/// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1934/// copied from \a a unchanged.
1935///
1936/// \code{.operation}
1937/// result[127:64] := a[127:64]
1938/// result[255:192] := a[255:192]
1939/// FOR i := 0 TO 3
1940/// j := i * 16
1941/// k := (imm >> i*2)[1:0] * 16
1942/// result[j+15:j] := a[k+15:k]
1943/// result[128+j+15:128+j] := a[128+k+15:128+k]
1944/// ENDFOR
1945/// \endcode
1946///
1947/// \headerfile <immintrin.h>
1948///
1949/// \code
1950/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1951/// \endcode
1952///
1953/// This intrinsic corresponds to the \c VPSHUFLW instruction.
1954///
1955/// \param a
1956/// A 256-bit vector of [16 x i16] to use as a source of data for the
1957/// result.
1958/// \param imm
1959/// An immediate 8-bit value specifying which elements to copy from \a a.
1960/// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1961/// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1962/// forth.
1963/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Implemented as a macro rather than a function. Each argument is
   parenthesized, cast, and expanded exactly once: `a` as __m256i then
   __v16hi, `imm` as int.
   NOTE(review): presumably a macro so that `imm` reaches the builtin as a
   compile-time constant -- confirm. */
1964#define _mm256_shufflelo_epi16(a, imm) \
1965 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
1966
1967/// Sets each byte of the result to the corresponding byte of the 256-bit
1968/// integer vector in \a __a, the negative of that byte, or zero, depending
1969/// on whether the corresponding byte of the 256-bit integer vector in
1970/// \a __b is greater than zero, less than zero, or equal to zero,
1971/// respectively.
1972///
1973/// \headerfile <immintrin.h>
1974///
1975/// This intrinsic corresponds to the \c VPSIGNB instruction.
1976///
1977/// \param __a
1978/// A 256-bit integer vector.
1979/// \param __b
1980/// A 256-bit integer vector.
1981/// \returns A 256-bit integer vector containing the result.
1982static __inline__ __m256i __DEFAULT_FN_ATTRS256
1983_mm256_sign_epi8(__m256i __a, __m256i __b)
1984{
  /* __a (values) and __b (sign selectors) are both passed as __v32qi views;
     the copy / negate / zero choice is made by the builtin, not here. */
1985 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
1986}
1987
1988/// Sets each element of the result to the corresponding element of the
1989/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
1990/// or zero, depending on whether the corresponding element of the 256-bit
1991/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
1992/// equal to zero, respectively.
1993///
1994/// \headerfile <immintrin.h>
1995///
1996/// This intrinsic corresponds to the \c VPSIGNW instruction.
1997///
1998/// \param __a
1999/// A 256-bit vector of [16 x i16].
2000/// \param __b
2001/// A 256-bit vector of [16 x i16].
2002/// \returns A 256-bit vector of [16 x i16] containing the result.
2003static __inline__ __m256i __DEFAULT_FN_ATTRS256
2004_mm256_sign_epi16(__m256i __a, __m256i __b)
2005{
  /* __a (values) and __b (sign selectors) are both passed as __v16hi views;
     the copy / negate / zero choice is made by the builtin, not here. */
2006 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2007}
2008
2009/// Sets each element of the result to the corresponding element of the
2010/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2011/// zero, depending on whether the corresponding element of the 256-bit
2012/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2013/// equal to zero, respectively.
2014///
2015/// \headerfile <immintrin.h>
2016///
2017/// This intrinsic corresponds to the \c VPSIGND instruction.
2018///
2019/// \param __a
2020/// A 256-bit vector of [8 x i32].
2021/// \param __b
2022/// A 256-bit vector of [8 x i32].
2023/// \returns A 256-bit vector of [8 x i32] containing the result.
2024static __inline__ __m256i __DEFAULT_FN_ATTRS256
2025_mm256_sign_epi32(__m256i __a, __m256i __b)
2026{
  /* __a (values) and __b (sign selectors) are both passed as __v8si views;
     the copy / negate / zero choice is made by the builtin, not here. */
2027 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2028}
2029
2030/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2031/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2032/// is greater than 15, the returned result is all zeroes.
2033///
2034/// \headerfile <immintrin.h>
2035///
2036/// \code
2037/// __m256i _mm256_slli_si256(__m256i a, const int imm);
2038/// \endcode
2039///
2040/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2041///
2042/// \param a
2043/// A 256-bit integer vector to be shifted.
2044/// \param imm
2045/// An unsigned immediate value specifying the shift count (in bytes).
2046/// \returns A 256-bit integer vector containing the result.
/* Implemented as a macro rather than a function; `a` and `imm` are each
   parenthesized, cast, and expanded exactly once. The expansion is the same
   as that of _mm256_bslli_epi128. */
2047#define _mm256_slli_si256(a, imm) \
2048 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2049 (int)(imm)))
2050
2051/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2052/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2053/// is greater than 15, the returned result is all zeroes.
2054///
2055/// \headerfile <immintrin.h>
2056///
2057/// \code
2058/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2059/// \endcode
2060///
2061/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2062///
2063/// \param a
2064/// A 256-bit integer vector to be shifted.
2065/// \param imm
2066/// An unsigned immediate value specifying the shift count (in bytes).
2067/// \returns A 256-bit integer vector containing the result.
/* Implemented as a macro rather than a function; `a` and `imm` are each
   parenthesized, cast, and expanded exactly once. The expansion is the same
   as that of _mm256_slli_si256. */
2068#define _mm256_bslli_epi128(a, imm) \
2069 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2070 (int)(imm)))
2071
2072/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2073/// left by \a __count bits, shifting in zero bits, and returns the result.
2074/// If \a __count is greater than 15, the returned result is all zeroes.
2075///
2076/// \headerfile <immintrin.h>
2077///
2078/// This intrinsic corresponds to the \c VPSLLW instruction.
2079///
2080/// \param __a
2081/// A 256-bit vector of [16 x i16] to be shifted.
2082/// \param __count
2083/// An unsigned integer value specifying the shift count (in bits).
2084/// \returns A 256-bit vector of [16 x i16] containing the result.
2085static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2086_mm256_slli_epi16(__m256i __a, int __count) {
  /* __count is forwarded to the builtin unchanged; this wrapper does no
     range check of its own, so the documented "greater than 15" result is
     left to the builtin. */
2087 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2088}
2089
2090/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2091/// left by the number of bits specified by the lower 64 bits of \a __count,
2092/// shifting in zero bits, and returns the result. If \a __count is greater
2093/// than 15, the returned result is all zeroes.
2094///
2095/// \headerfile <immintrin.h>
2096///
2097/// This intrinsic corresponds to the \c VPSLLW instruction.
2098///
2099/// \param __a
2100/// A 256-bit vector of [16 x i16] to be shifted.
2101/// \param __count
2102/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2103/// shift count (in bits). The upper element is ignored.
2104/// \returns A 256-bit vector of [16 x i16] containing the result.
2105static __inline__ __m256i __DEFAULT_FN_ATTRS256
2106_mm256_sll_epi16(__m256i __a, __m128i __count)
2107{
  /* The whole 128-bit count vector is passed through as a __v8hi view; no
     element is extracted here.
     NOTE(review): the cast presumably exists only to match the builtin's
     parameter type -- confirm. */
2108 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2109}
2110
2111/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2112/// left by \a __count bits, shifting in zero bits, and returns the result.
2113/// If \a __count is greater than 31, the returned result is all zeroes.
2114///
2115/// \headerfile <immintrin.h>
2116///
2117/// This intrinsic corresponds to the \c VPSLLD instruction.
2118///
2119/// \param __a
2120/// A 256-bit vector of [8 x i32] to be shifted.
2121/// \param __count
2122/// An unsigned integer value specifying the shift count (in bits).
2123/// \returns A 256-bit vector of [8 x i32] containing the result.
2124static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2125_mm256_slli_epi32(__m256i __a, int __count) {
  /* __count is forwarded to the builtin unchanged; this wrapper does no
     range check of its own, so the documented "greater than 31" result is
     left to the builtin. */
2126 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2127}
2128
2129/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2130/// left by the number of bits given in the lower 64 bits of \a __count,
2131/// shifting in zero bits, and returns the result. If \a __count is greater
2132/// than 31, the returned result is all zeroes.
2133///
2134/// \headerfile <immintrin.h>
2135///
2136/// This intrinsic corresponds to the \c VPSLLD instruction.
2137///
2138/// \param __a
2139/// A 256-bit vector of [8 x i32] to be shifted.
2140/// \param __count
2141/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2142/// shift count (in bits). The upper element is ignored.
2143/// \returns A 256-bit vector of [8 x i32] containing the result.
2144static __inline__ __m256i __DEFAULT_FN_ATTRS256
2145_mm256_sll_epi32(__m256i __a, __m128i __count)
2146{
  /* The whole 128-bit count vector is passed through as a __v4si view; no
     element is extracted here.
     NOTE(review): the cast presumably exists only to match the builtin's
     parameter type -- confirm. */
2147 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2148}
2149
2150/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2151/// left by \a __count bits, shifting in zero bits, and returns the result.
2152/// If \a __count is greater than 63, the returned result is all zeroes.
2153///
2154/// \headerfile <immintrin.h>
2155///
2156/// This intrinsic corresponds to the \c VPSLLQ instruction.
2157///
2158/// \param __a
2159/// A 256-bit vector of [4 x i64] to be shifted.
2160/// \param __count
2161/// An unsigned integer value specifying the shift count (in bits).
2162/// \returns A 256-bit vector of [4 x i64] containing the result.
2163static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2164_mm256_slli_epi64(__m256i __a, int __count) {
  /* __count is forwarded to the builtin unchanged; no range check is done
     here. The result is cast to __m256i explicitly, as the 16- and 32-bit
     siblings (_mm256_slli_epi16, _mm256_slli_epi32) do, instead of relying
     on an implicit conversion. */
2165 return (__m256i)__builtin_ia32_psllqi256((__v4di)__a, __count);
2166}
2167
2168/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2169/// left by the number of bits given in the lower 64 bits of \a __count,
2170/// shifting in zero bits, and returns the result. If \a __count is greater
2171/// than 63, the returned result is all zeroes.
2172///
2173/// \headerfile <immintrin.h>
2174///
2175/// This intrinsic corresponds to the \c VPSLLQ instruction.
2176///
2177/// \param __a
2178/// A 256-bit vector of [4 x i64] to be shifted.
2179/// \param __count
2180/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2181/// shift count (in bits). The upper element is ignored.
2182/// \returns A 256-bit vector of [4 x i64] containing the result.
2183static __inline__ __m256i __DEFAULT_FN_ATTRS256
2184_mm256_sll_epi64(__m256i __a, __m128i __count)
2185{
  /* The 128-bit count vector is passed to the builtin as-is (no element
     type cast is applied to it). The result is cast to __m256i explicitly,
     as the 16- and 32-bit siblings (_mm256_sll_epi16, _mm256_sll_epi32)
     do, instead of relying on an implicit conversion. */
2186 return (__m256i)__builtin_ia32_psllq256((__v4di)__a, __count);
2187}
2188
2189/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2190/// right by \a __count bits, shifting in sign bits, and returns the result.
2191/// If \a __count is greater than 15, each element of the result is either
2192/// 0 or -1 according to the corresponding input sign bit.
2193///
2194/// \headerfile <immintrin.h>
2195///
2196/// This intrinsic corresponds to the \c VPSRAW instruction.
2197///
2198/// \param __a
2199/// A 256-bit vector of [16 x i16] to be shifted.
2200/// \param __count
2201/// An unsigned integer value specifying the shift count (in bits).
2202/// \returns A 256-bit vector of [16 x i16] containing the result.
2203static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2204_mm256_srai_epi16(__m256i __a, int __count) {
  /* __count is forwarded to the builtin unchanged; the sign fill and the
     documented "greater than 15" result are left to the builtin. */
2205 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2206}
2207
2208/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2209/// right by the number of bits given in the lower 64 bits of \a __count,
2210/// shifting in sign bits, and returns the result. If \a __count is greater
2211/// than 15, each element of the result is either 0 or -1 according to the
2212/// corresponding input sign bit.
2213///
2214/// \headerfile <immintrin.h>
2215///
2216/// This intrinsic corresponds to the \c VPSRAW instruction.
2217///
2218/// \param __a
2219/// A 256-bit vector of [16 x i16] to be shifted.
2220/// \param __count
2221/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2222/// shift count (in bits). The upper element is ignored.
2223/// \returns A 256-bit vector of [16 x i16] containing the result.
2224static __inline__ __m256i __DEFAULT_FN_ATTRS256
2225_mm256_sra_epi16(__m256i __a, __m128i __count)
2226{
  /* The whole 128-bit count vector is passed through as a __v8hi view; no
     element is extracted here.
     NOTE(review): the cast presumably exists only to match the builtin's
     parameter type -- confirm. */
2227 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2228}
2229
2230/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2231/// right by \a __count bits, shifting in sign bits, and returns the result.
2232/// If \a __count is greater than 31, each element of the result is either
2233/// 0 or -1 according to the corresponding input sign bit.
2234///
2235/// \headerfile <immintrin.h>
2236///
2237/// This intrinsic corresponds to the \c VPSRAD instruction.
2238///
2239/// \param __a
2240/// A 256-bit vector of [8 x i32] to be shifted.
2241/// \param __count
2242/// An unsigned integer value specifying the shift count (in bits).
2243/// \returns A 256-bit vector of [8 x i32] containing the result.
2244static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2245_mm256_srai_epi32(__m256i __a, int __count) {
  /* __count is forwarded to the builtin unchanged; the sign fill and the
     documented "greater than 31" result are left to the builtin. */
2246 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2247}
2248
2249/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2250/// right by the number of bits given in the lower 64 bits of \a __count,
2251/// shifting in sign bits, and returns the result. If \a __count is greater
2252/// than 31, each element of the result is either 0 or -1 according to the
2253/// corresponding input sign bit.
2254///
2255/// \headerfile <immintrin.h>
2256///
2257/// This intrinsic corresponds to the \c VPSRAD instruction.
2258///
2259/// \param __a
2260/// A 256-bit vector of [8 x i32] to be shifted.
2261/// \param __count
2262/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2263/// shift count (in bits). The upper element is ignored.
2264/// \returns A 256-bit vector of [8 x i32] containing the result.
2265static __inline__ __m256i __DEFAULT_FN_ATTRS256
2266_mm256_sra_epi32(__m256i __a, __m128i __count)
2267{
  /* The whole 128-bit count vector is passed through as a __v4si view; no
     element is extracted here.
     NOTE(review): the cast presumably exists only to match the builtin's
     parameter type -- confirm. */
2268 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2269}
2270
2271/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2272/// \a imm bytes, shifting in zero bytes, and returns the result. If
2273/// \a imm is greater than 15, the returned result is all zeroes.
2274///
2275/// \headerfile <immintrin.h>
2276///
2277/// \code
2278/// __m256i _mm256_srli_si256(__m256i a, const int imm);
2279/// \endcode
2280///
2281/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2282///
2283/// \param a
2284/// A 256-bit integer vector to be shifted.
2285/// \param imm
2286/// An unsigned immediate value specifying the shift count (in bytes).
2287/// \returns A 256-bit integer vector containing the result.
/* Implemented as a macro rather than a function; `a` and `imm` are each
   parenthesized, cast, and expanded exactly once. The expansion is the same
   as that of _mm256_bsrli_epi128. */
2288#define _mm256_srli_si256(a, imm) \
2289 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2290 (int)(imm)))
2291
2292/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2293/// \a imm bytes, shifting in zero bytes, and returns the result. If
2294/// \a imm is greater than 15, the returned result is all zeroes.
2295///
2296/// \headerfile <immintrin.h>
2297///
2298/// \code
2299/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2300/// \endcode
2301///
2302/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2303///
2304/// \param a
2305/// A 256-bit integer vector to be shifted.
2306/// \param imm
2307/// An unsigned immediate value specifying the shift count (in bytes).
2308/// \returns A 256-bit integer vector containing the result.
/* Implemented as a macro rather than a function; `a` and `imm` are each
   parenthesized, cast, and expanded exactly once. The expansion is the same
   as that of _mm256_srli_si256. */
2309#define _mm256_bsrli_epi128(a, imm) \
2310 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2311 (int)(imm)))
2312
2313/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2314/// right by \a __count bits, shifting in zero bits, and returns the result.
2315/// If \a __count is greater than 15, the returned result is all zeroes.
2316///
2317/// \headerfile <immintrin.h>
2318///
2319/// This intrinsic corresponds to the \c VPSRLW instruction.
2320///
2321/// \param __a
2322/// A 256-bit vector of [16 x i16] to be shifted.
2323/// \param __count
2324/// An unsigned integer value specifying the shift count (in bits).
2325/// \returns A 256-bit vector of [16 x i16] containing the result.
2326static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2327_mm256_srli_epi16(__m256i __a, int __count) {
  /* __count is forwarded to the builtin unchanged; this wrapper does no
     range check of its own, so the documented "greater than 15" result is
     left to the builtin. */
2328 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2329}
2330
2331/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2332/// right by the number of bits given in the lower 64 bits of \a __count,
2333/// shifting in zero bits, and returns the result. If \a __count is greater
2334/// than 15, the returned result is all zeroes.
2335///
2336/// \headerfile <immintrin.h>
2337///
2338/// This intrinsic corresponds to the \c VPSRLW instruction.
2339///
2340/// \param __a
2341/// A 256-bit vector of [16 x i16] to be shifted.
2342/// \param __count
2343/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2344/// shift count (in bits). The upper element is ignored.
2345/// \returns A 256-bit vector of [16 x i16] containing the result.
2346static __inline__ __m256i __DEFAULT_FN_ATTRS256
2347_mm256_srl_epi16(__m256i __a, __m128i __count)
2348{
  /* The whole 128-bit count vector is passed through as a __v8hi view; no
     element is extracted here.
     NOTE(review): the cast presumably exists only to match the builtin's
     parameter type -- confirm. */
2349 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2350}
2351
2352/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2353/// right by \a __count bits, shifting in zero bits, and returns the result.
2354/// If \a __count is greater than 31, the returned result is all zeroes.
2355///
2356/// \headerfile <immintrin.h>
2357///
2358/// This intrinsic corresponds to the \c VPSRLD instruction.
2359///
2360/// \param __a
2361/// A 256-bit vector of [8 x i32] to be shifted.
2362/// \param __count
2363/// An unsigned integer value specifying the shift count (in bits).
2364/// \returns A 256-bit vector of [8 x i32] containing the result.
2365static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2366_mm256_srli_epi32(__m256i __a, int __count) {
  /* __count is forwarded to the builtin unchanged; this wrapper does no
     range check of its own, so the documented "greater than 31" result is
     left to the builtin. */
2367 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2368}
2369
2370/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2371/// right by the number of bits given in the lower 64 bits of \a __count,
2372/// shifting in zero bits, and returns the result. If \a __count is greater
2373/// than 31, the returned result is all zeroes.
2374///
2375/// \headerfile <immintrin.h>
2376///
2377/// This intrinsic corresponds to the \c VPSRLD instruction.
2378///
2379/// \param __a
2380/// A 256-bit vector of [8 x i32] to be shifted.
2381/// \param __count
2382/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2383/// shift count (in bits). The upper element is ignored.
2384/// \returns A 256-bit vector of [8 x i32] containing the result.
2385static __inline__ __m256i __DEFAULT_FN_ATTRS256
2386_mm256_srl_epi32(__m256i __a, __m128i __count)
2387{
  /* The whole 128-bit count vector is passed through as a __v4si view; no
     element is extracted here.
     NOTE(review): the cast presumably exists only to match the builtin's
     parameter type -- confirm. */
2388 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2389}
2390
2391/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2392/// right by \a __count bits, shifting in zero bits, and returns the result.
2393/// If \a __count is greater than 63, the returned result is all zeroes.
2394///
2395/// \headerfile <immintrin.h>
2396///
2397/// This intrinsic corresponds to the \c VPSRLQ instruction.
2398///
2399/// \param __a
2400/// A 256-bit vector of [4 x i64] to be shifted.
2401/// \param __count
2402/// An unsigned integer value specifying the shift count (in bits).
2403/// \returns A 256-bit vector of [4 x i64] containing the result.
2404static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2405_mm256_srli_epi64(__m256i __a, int __count) {
  /* __count is forwarded to the builtin unchanged; no range check is done
     here. The result is cast to __m256i explicitly, as the 16- and 32-bit
     siblings (_mm256_srli_epi16, _mm256_srli_epi32) do, instead of relying
     on an implicit conversion. */
2406 return (__m256i)__builtin_ia32_psrlqi256((__v4di)__a, __count);
2407}
2408
2409/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2410/// right by the number of bits given in the lower 64 bits of \a __count,
2411/// shifting in zero bits, and returns the result. If \a __count is greater
2412/// than 63, the returned result is all zeroes.
2413///
2414/// \headerfile <immintrin.h>
2415///
2416/// This intrinsic corresponds to the \c VPSRLQ instruction.
2417///
2418/// \param __a
2419/// A 256-bit vector of [4 x i64] to be shifted.
2420/// \param __count
2421/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2422/// shift count (in bits). The upper element is ignored.
2423/// \returns A 256-bit vector of [4 x i64] containing the result.
2424static __inline__ __m256i __DEFAULT_FN_ATTRS256
2425_mm256_srl_epi64(__m256i __a, __m128i __count)
2426{
  /* The 128-bit count vector is passed to the builtin as-is (no element
     type cast is applied to it). The result is cast to __m256i explicitly,
     as the 16- and 32-bit siblings (_mm256_srl_epi16, _mm256_srl_epi32)
     do, instead of relying on an implicit conversion. */
2427 return (__m256i)__builtin_ia32_psrlq256((__v4di)__a, __count);
2428}
2429
2430/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2431/// vectors. Returns the lower 8 bits of each difference in the
2432/// corresponding byte of the 256-bit integer vector result (overflow is
2433/// ignored).
2434///
2435/// \code{.operation}
2436/// FOR i := 0 TO 31
2437/// j := i*8
2438/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2439/// ENDFOR
2440/// \endcode
2441///
2442/// \headerfile <immintrin.h>
2443///
2444/// This intrinsic corresponds to the \c VPSUBB instruction.
2445///
2446/// \param __a
2447/// A 256-bit integer vector containing the minuends.
2448/// \param __b
2449/// A 256-bit integer vector containing the subtrahends.
2450/// \returns A 256-bit integer vector containing the differences.
2451static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2452_mm256_sub_epi8(__m256i __a, __m256i __b) {
  /* Element-wise vector subtraction on the __v32qu view of both operands,
     cast back to __m256i.
     NOTE(review): __v32qu is declared outside this chunk; presumably the
     unsigned 8-bit lane type, so that wraparound is well defined - confirm. */
2453 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2454}
2455
2456/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2457/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2458/// the corresponding element of the [16 x i16] result (overflow is
2459/// ignored).
2460///
2461/// \code{.operation}
2462/// FOR i := 0 TO 15
2463/// j := i*16
2464/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2465/// ENDFOR
2466/// \endcode
2467///
2468/// \headerfile <immintrin.h>
2469///
2470/// This intrinsic corresponds to the \c VPSUBW instruction.
2471///
2472/// \param __a
2473/// A 256-bit vector of [16 x i16] containing the minuends.
2474/// \param __b
2475/// A 256-bit vector of [16 x i16] containing the subtrahends.
2476/// \returns A 256-bit vector of [16 x i16] containing the differences.
2477static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2478_mm256_sub_epi16(__m256i __a, __m256i __b) {
  /* Element-wise vector subtraction on the __v16hu view of both operands,
     cast back to __m256i.
     NOTE(review): __v16hu is declared outside this chunk; presumably the
     unsigned 16-bit lane type, so that wraparound is well defined - confirm. */
2479 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2480}
2481
2482/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2483/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2484/// the corresponding element of the [8 x i32] result (overflow is ignored).
2485///
2486/// \code{.operation}
2487/// FOR i := 0 TO 7
2488/// j := i*32
2489/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2490/// ENDFOR
2491/// \endcode
2492///
2493/// \headerfile <immintrin.h>
2494///
2495/// This intrinsic corresponds to the \c VPSUBD instruction.
2496///
2497/// \param __a
2498/// A 256-bit vector of [8 x i32] containing the minuends.
2499/// \param __b
2500/// A 256-bit vector of [8 x i32] containing the subtrahends.
2501/// \returns A 256-bit vector of [8 x i32] containing the differences.
2502static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2503_mm256_sub_epi32(__m256i __a, __m256i __b) {
  /* Element-wise vector subtraction on the __v8su view of both operands,
     cast back to __m256i.
     NOTE(review): __v8su is declared outside this chunk; presumably the
     unsigned 32-bit lane type, so that wraparound is well defined - confirm. */
2504 return (__m256i)((__v8su)__a - (__v8su)__b);
2505}
2506
2507/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2508/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2509/// the corresponding element of the [4 x i64] result (overflow is ignored).
2510///
2511/// \code{.operation}
2512/// FOR i := 0 TO 3
2513/// j := i*64
2514/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2515/// ENDFOR
2516/// \endcode
2517///
2518/// \headerfile <immintrin.h>
2519///
2520/// This intrinsic corresponds to the \c VPSUBQ instruction.
2521///
2522/// \param __a
2523/// A 256-bit vector of [4 x i64] containing the minuends.
2524/// \param __b
2525/// A 256-bit vector of [4 x i64] containing the subtrahends.
2526/// \returns A 256-bit vector of [4 x i64] containing the differences.
2527static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2528_mm256_sub_epi64(__m256i __a, __m256i __b) {
  /* Element-wise vector subtraction on the __v4du view of both operands,
     cast back to __m256i.
     NOTE(review): __v4du is declared outside this chunk; presumably the
     unsigned 64-bit lane type, so that wraparound is well defined - confirm. */
2529 return (__m256i)((__v4du)__a - (__v4du)__b);
2530}
2531
2532/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2533/// vectors using signed saturation, and returns each difference in the
2534/// corresponding byte of the 256-bit integer vector result.
2535///
2536/// \code{.operation}
2537/// FOR i := 0 TO 31
2538/// j := i*8
2539/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2540/// ENDFOR
2541/// \endcode
2542///
2543/// \headerfile <immintrin.h>
2544///
2545/// This intrinsic corresponds to the \c VPSUBSB instruction.
2546///
2547/// \param __a
2548/// A 256-bit integer vector containing the minuends.
2549/// \param __b
2550/// A 256-bit integer vector containing the subtrahends.
2551/// \returns A 256-bit integer vector containing the differences.
2552static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2553_mm256_subs_epi8(__m256i __a, __m256i __b) {
  /* Both operands are cast to __v32qs before the call, so the generic
     __builtin_elementwise_sub_sat works on lanes of that type; the cast is
     the only thing distinguishing this from the other subs_* wrappers. */
2554 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2555}
2556
2557/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2558/// vectors of [16 x i16] using signed saturation, and returns each
2559/// difference in the corresponding element of the [16 x i16] result.
2560///
2561/// \code{.operation}
2562/// FOR i := 0 TO 15
2563/// j := i*16
2564/// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2565/// ENDFOR
2566/// \endcode
2567///
2568/// \headerfile <immintrin.h>
2569///
2570/// This intrinsic corresponds to the \c VPSUBSW instruction.
2571///
2572/// \param __a
2573/// A 256-bit vector of [16 x i16] containing the minuends.
2574/// \param __b
2575/// A 256-bit vector of [16 x i16] containing the subtrahends.
2576/// \returns A 256-bit vector of [16 x i16] containing the differences.
2577static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2578_mm256_subs_epi16(__m256i __a, __m256i __b) {
  /* Both operands are cast to __v16hi before the call, so the generic
     __builtin_elementwise_sub_sat works on lanes of that type; the cast is
     the only thing distinguishing this from the other subs_* wrappers. */
2579 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2580}
2581
2582/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2583/// vectors using unsigned saturation, and returns each difference in the
2584/// corresponding byte of the 256-bit integer vector result. For each byte,
2585/// computes <c> result = __a - __b </c>.
2586///
2587/// \code{.operation}
2588/// FOR i := 0 TO 31
2589/// j := i*8
2590/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2591/// ENDFOR
2592/// \endcode
2593///
2594/// \headerfile <immintrin.h>
2595///
2596/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2597///
2598/// \param __a
2599/// A 256-bit integer vector containing the minuends.
2600/// \param __b
2601/// A 256-bit integer vector containing the subtrahends.
2602/// \returns A 256-bit integer vector containing the differences.
2603static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2604_mm256_subs_epu8(__m256i __a, __m256i __b) {
  /* Both operands are cast to __v32qu before the call, so the generic
     __builtin_elementwise_sub_sat works on lanes of that type; the cast is
     the only thing distinguishing this from the other subs_* wrappers. */
2605 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2606}
2607
2608/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2609/// vectors of [16 x i16] using unsigned saturation, and returns each
2610/// difference in the corresponding element of the [16 x i16] result.
2611///
2612/// \code{.operation}
2613/// FOR i := 0 TO 15
2614/// j := i*16
2615/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2616/// ENDFOR
2617/// \endcode
2618///
2619/// \headerfile <immintrin.h>
2620///
2621/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2622///
2623/// \param __a
2624/// A 256-bit vector of [16 x i16] containing the minuends.
2625/// \param __b
2626/// A 256-bit vector of [16 x i16] containing the subtrahends.
2627/// \returns A 256-bit vector of [16 x i16] containing the differences.
2628static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2629_mm256_subs_epu16(__m256i __a, __m256i __b) {
  /* Both operands are cast to __v16hu before the call, so the generic
     __builtin_elementwise_sub_sat works on lanes of that type; the cast is
     the only thing distinguishing this from the other subs_* wrappers. */
2630 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2631}
2632
2633/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2634/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2635/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2636/// input; other bits in these parameters are ignored.
2637///
2638/// \code{.operation}
2639/// result[7:0] := __a[71:64]
2640/// result[15:8] := __b[71:64]
2641/// result[23:16] := __a[79:72]
2642/// result[31:24] := __b[79:72]
2643/// . . .
2644/// result[127:120] := __b[127:120]
2645/// result[135:128] := __a[199:192]
2646/// . . .
2647/// result[255:248] := __b[255:248]
2648/// \endcode
2649///
2650/// \headerfile <immintrin.h>
2651///
2652/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2653///
2654/// \param __a
2655/// A 256-bit integer vector used as the source for the even-numbered bytes
2656/// of the result.
2657/// \param __b
2658/// A 256-bit integer vector used as the source for the odd-numbered bytes
2659/// of the result.
2660/// \returns A 256-bit integer vector containing the result.
2661static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2662_mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
  /* Shuffle indices 0-31 select bytes of __a; 32+k selects byte k of __b.
     The list alternates __a/__b over bytes 8-15 and then bytes 24-31, i.e.
     the upper 64 bits of each 128-bit half, handled one half at a time. */
2663 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2664}
2665
2666/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2667/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2668/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2669/// 128-bit half of \a __a and \a __b as input; other bits in these
2670/// parameters are ignored.
2671///
2672/// \code{.operation}
2673/// result[15:0] := __a[79:64]
2674/// result[31:16] := __b[79:64]
2675/// result[47:32] := __a[95:80]
2676/// result[63:48] := __b[95:80]
2677/// . . .
2678/// result[127:112] := __b[127:112]
2679/// result[143:128] := __a[207:192]
2680/// . . .
2681/// result[255:240] := __b[255:240]
2682/// \endcode
2683///
2684/// \headerfile <immintrin.h>
2685///
2686/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2687///
2688/// \param __a
2689/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2690/// elements of the result.
2691/// \param __b
2692/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2693/// elements of the result.
2694/// \returns A 256-bit vector of [16 x i16] containing the result.
2695static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2696_mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
  /* Shuffle indices 0-15 select elements of __a; 16+k selects element k of
     __b. The list alternates __a/__b over elements 4-7 and then 12-15, i.e.
     the upper 64 bits of each 128-bit half, handled one half at a time. */
2697 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2698}
2699
2700/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2701/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2702/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2703/// of \a __a and \a __b as input; other bits in these parameters are
2704/// ignored.
2705///
2706/// \code{.operation}
2707/// result[31:0] := __a[95:64]
2708/// result[63:32] := __b[95:64]
2709/// result[95:64] := __a[127:96]
2710/// result[127:96] := __b[127:96]
2711/// result[159:128] := __a[223:192]
2712/// result[191:160] := __b[223:192]
2713/// result[223:192] := __a[255:224]
2714/// result[255:224] := __b[255:224]
2715/// \endcode
2716///
2717/// \headerfile <immintrin.h>
2718///
2719/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2720///
2721/// \param __a
2722/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2723/// elements of the result.
2724/// \param __b
2725/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2726/// elements of the result.
2727/// \returns A 256-bit vector of [8 x i32] containing the result.
2728static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2729_mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
  /* Shuffle indices 0-7 select elements of __a; 8+k selects element k of
     __b. The list alternates __a/__b over elements 2-3 and then 6-7, i.e.
     the upper 64 bits of each 128-bit half, handled one half at a time. */
2730 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2731}
2732
2733/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2734/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2735/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2736/// of \a __a and \a __b as input; other bits in these parameters are
2737/// ignored.
2738///
2739/// \code{.operation}
2740/// result[63:0] := __a[127:64]
2741/// result[127:64] := __b[127:64]
2742/// result[191:128] := __a[255:192]
2743/// result[255:192] := __b[255:192]
2744/// \endcode
2745///
2746/// \headerfile <immintrin.h>
2747///
2748/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2749///
2750/// \param __a
2751/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2752/// elements of the result.
2753/// \param __b
2754/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2755/// elements of the result.
2756/// \returns A 256-bit vector of [4 x i64] containing the result.
2757static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2758_mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
  /* Shuffle indices 0-3 select elements of __a; 4+k selects element k of
     __b. The list pairs element 1 of each input, then element 3 of each,
     i.e. the upper 64 bits of each 128-bit half. */
2759 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2760}
2761
2762/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2763/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2764/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2765/// input; other bits in these parameters are ignored.
2766///
2767/// \code{.operation}
2768/// result[7:0] := __a[7:0]
2769/// result[15:8] := __b[7:0]
2770/// result[23:16] := __a[15:8]
2771/// result[31:24] := __b[15:8]
2772/// . . .
2773/// result[127:120] := __b[63:56]
2774/// result[135:128] := __a[135:128]
2775/// . . .
2776/// result[255:248] := __b[191:184]
2777/// \endcode
2778///
2779/// \headerfile <immintrin.h>
2780///
2781/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2782///
2783/// \param __a
2784/// A 256-bit integer vector used as the source for the even-numbered bytes
2785/// of the result.
2786/// \param __b
2787/// A 256-bit integer vector used as the source for the odd-numbered bytes
2788/// of the result.
2789/// \returns A 256-bit integer vector containing the result.
2790static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2791_mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
  /* Shuffle indices 0-31 select bytes of __a; 32+k selects byte k of __b.
     The list alternates __a/__b over bytes 0-7 and then bytes 16-23, i.e.
     the lower 64 bits of each 128-bit half, handled one half at a time. */
2792 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2793}
2794
2795/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2796/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2797/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2798/// 128-bit half of \a __a and \a __b as input; other bits in these
2799/// parameters are ignored.
2800///
2801/// \code{.operation}
2802/// result[15:0] := __a[15:0]
2803/// result[31:16] := __b[15:0]
2804/// result[47:32] := __a[31:16]
2805/// result[63:48] := __b[31:16]
2806/// . . .
2807/// result[127:112] := __b[63:48]
2808/// result[143:128] := __a[143:128]
2809/// . . .
2810/// result[255:240] := __b[191:176]
2811/// \endcode
2812///
2813/// \headerfile <immintrin.h>
2814///
2815/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2816///
2817/// \param __a
2818/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2819/// elements of the result.
2820/// \param __b
2821/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2822/// elements of the result.
2823/// \returns A 256-bit vector of [16 x i16] containing the result.
2824static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2825_mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
  /* Shuffle indices 0-15 select elements of __a; 16+k selects element k of
     __b. The list alternates __a/__b over elements 0-3 and then 8-11, i.e.
     the lower 64 bits of each 128-bit half, handled one half at a time. */
2826 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2827}
2828
2829/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2830/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2831/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2832/// of \a __a and \a __b as input; other bits in these parameters are
2833/// ignored.
2834///
2835/// \code{.operation}
2836/// result[31:0] := __a[31:0]
2837/// result[63:32] := __b[31:0]
2838/// result[95:64] := __a[63:32]
2839/// result[127:96] := __b[63:32]
2840/// result[159:128] := __a[159:128]
2841/// result[191:160] := __b[159:128]
2842/// result[223:192] := __a[191:160]
2843/// result[255:224] := __b[191:160]
2844/// \endcode
2845///
2846/// \headerfile <immintrin.h>
2847///
2848/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2849///
2850/// \param __a
2851/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2852/// elements of the result.
2853/// \param __b
2854/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2855/// elements of the result.
2856/// \returns A 256-bit vector of [8 x i32] containing the result.
2857static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2858_mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
  /* Shuffle indices 0-7 select elements of __a; 8+k selects element k of
     __b. The list alternates __a/__b over elements 0-1 and then 4-5, i.e.
     the lower 64 bits of each 128-bit half, handled one half at a time. */
2859 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2860}
2861
2862/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2863/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2864/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2865/// of \a __a and \a __b as input; other bits in these parameters are
2866/// ignored.
2867///
2868/// \code{.operation}
2869/// result[63:0] := __a[63:0]
2870/// result[127:64] := __b[63:0]
2871/// result[191:128] := __a[191:128]
2872/// result[255:192] := __b[191:128]
2873/// \endcode
2874///
2875/// \headerfile <immintrin.h>
2876///
2877/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2878///
2879/// \param __a
2880/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2881/// elements of the result.
2882/// \param __b
2883/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2884/// elements of the result.
2885/// \returns A 256-bit vector of [4 x i64] containing the result.
2886static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2887_mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
  /* Shuffle indices 0-3 select elements of __a; 4+k selects element k of
     __b. The list pairs element 0 of each input, then element 2 of each,
     i.e. the lower 64 bits of each 128-bit half. */
2888 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2889}
2890
2891/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2892/// \a __b.
2893///
2894/// \headerfile <immintrin.h>
2895///
2896/// This intrinsic corresponds to the \c VPXOR instruction.
2897///
2898/// \param __a
2899/// A 256-bit integer vector.
2900/// \param __b
2901/// A 256-bit integer vector.
2902/// \returns A 256-bit integer vector containing the result.
2903static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2904_mm256_xor_si256(__m256i __a, __m256i __b)
2905{
  /* Applies the ^ operator to the __v4du views of both operands and casts the
     result back to __m256i; no builtin call is involved. */
2906 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2907}
2908
2909/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2910/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2911/// boundary.
2912///
2913/// \headerfile <immintrin.h>
2914///
2915/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2916///
2917/// \param __V
2918/// A pointer to the 32-byte aligned memory containing the vector to load.
2919/// \returns A 256-bit integer vector loaded from memory.
2920static __inline__ __m256i __DEFAULT_FN_ATTRS256
2922{
2923 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2924 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2925}
2926
2927/// Broadcasts the 32-bit floating-point value from the low element of the
2928/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2929/// 128-bit vector of [4 x float].
2930///
2931/// \headerfile <immintrin.h>
2932///
2933/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2934///
2935/// \param __X
2936/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2937/// \returns A 128-bit vector of [4 x float] containing the result.
2938static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
2940 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
2941}
2942
2943/// Broadcasts the 64-bit floating-point value from the low element of the
2944/// 128-bit vector of [2 x double] in \a __a to both elements of the
2945/// result's 128-bit vector of [2 x double].
2946///
2947/// \headerfile <immintrin.h>
2948///
2949/// This intrinsic corresponds to the \c MOVDDUP instruction.
2950///
2951/// \param __a
2952/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2953/// \returns A 128-bit vector of [2 x double] containing the result.
2954static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
2956 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2957}
2958
2959/// Broadcasts the 32-bit floating-point value from the low element of the
2960/// 128-bit vector of [4 x float] in \a __X to all elements of the
2961/// result's 256-bit vector of [8 x float].
2962///
2963/// \headerfile <immintrin.h>
2964///
2965/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2966///
2967/// \param __X
2968/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2969/// \returns A 256-bit vector of [8 x float] containing the result.
2970static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
2972 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2973}
2974
2975/// Broadcasts the 64-bit floating-point value from the low element of the
2976/// 128-bit vector of [2 x double] in \a __X to all elements of the
2977/// result's 256-bit vector of [4 x double].
2978///
2979/// \headerfile <immintrin.h>
2980///
2981/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2982///
2983/// \param __X
2984/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2985/// \returns A 256-bit vector of [4 x double] containing the result.
2986static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
2988 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
2989}
2990
2991/// Broadcasts the 128-bit integer data from \a __X to both the lower and
2992/// upper halves of the 256-bit result.
2993///
2994/// \headerfile <immintrin.h>
2995///
2996/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
2997///
2998/// \param __X
2999/// A 128-bit integer vector to be broadcast.
3000/// \returns A 256-bit integer vector containing the result.
3001static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3003 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3004}
3005
/* Alternate spelling: expands to a call of _mm256_broadcastsi128_si256 with
   the same single argument. */
3006#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3007
3008/// Merges 32-bit integer elements from either of the two 128-bit vectors of
3009/// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3010/// as specified by the immediate integer operand \a M.
3011///
3012/// \code{.operation}
3013/// FOR i := 0 TO 3
3014/// j := i*32
3015/// IF M[i] == 0
3016/// result[31+j:j] := V1[31+j:j]
3017/// ELSE
3018/// result[31+j:j] := V2[31+j:j]
3019/// FI
3020/// ENDFOR
3021/// \endcode
3022///
3023/// \headerfile <immintrin.h>
3024///
3025/// \code
3026/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3027/// \endcode
3028///
3029/// This intrinsic corresponds to the \c VPBLENDD instruction.
3030///
3031/// \param V1
3032/// A 128-bit vector of [4 x i32] containing source values.
3033/// \param V2
3034/// A 128-bit vector of [4 x i32] containing source values.
3035/// \param M
3036/// An immediate 8-bit integer operand, with bits [3:0] specifying the
3037/// source for each element of the result. The position of the mask bit
3038/// corresponds to the index of a copied value. When a mask bit is 0, the
3039/// element is copied from \a V1; otherwise, it is copied from \a V2.
3040/// \returns A 128-bit vector of [4 x i32] containing the result.
/* Implemented as a macro rather than a function. Each argument is
   parenthesized and cast ((__m128i) then (__v4si) for the vectors, (int) for
   M) before it reaches the pblendd128 builtin. M is documented as an
   immediate operand, which is presumably why a macro is used here. */
3041#define _mm_blend_epi32(V1, V2, M) \
3042 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
3043 (__v4si)(__m128i)(V2), (int)(M)))
3044
3045/// Merges 32-bit integer elements from either of the two 256-bit vectors of
3046/// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3047/// as specified by the immediate integer operand \a M.
3048///
3049/// \code{.operation}
3050/// FOR i := 0 TO 7
3051/// j := i*32
3052/// IF M[i] == 0
3053/// result[31+j:j] := V1[31+j:j]
3054/// ELSE
3055/// result[31+j:j] := V2[31+j:j]
3056/// FI
3057/// ENDFOR
3058/// \endcode
3059///
3060/// \headerfile <immintrin.h>
3061///
3062/// \code
3063/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3064/// \endcode
3065///
3066/// This intrinsic corresponds to the \c VPBLENDD instruction.
3067///
3068/// \param V1
3069/// A 256-bit vector of [8 x i32] containing source values.
3070/// \param V2
3071/// A 256-bit vector of [8 x i32] containing source values.
3072/// \param M
3073/// An immediate 8-bit integer operand, with bits [7:0] specifying the
3074/// source for each element of the result. The position of the mask bit
3075/// corresponds to the index of a copied value. When a mask bit is 0, the
3076/// element is copied from \a V1; otherwise, it is copied from \a V2.
3077/// \returns A 256-bit vector of [8 x i32] containing the result.
/* Implemented as a macro rather than a function. Each argument is
   parenthesized and cast ((__m256i) then (__v8si) for the vectors, (int) for
   M) before it reaches the pblendd256 builtin. M is documented as an
   immediate operand, which is presumably why a macro is used here. */
3078#define _mm256_blend_epi32(V1, V2, M) \
3079 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
3080 (__v8si)(__m256i)(V2), (int)(M)))
3081
3082/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3083/// bytes of the 256-bit result.
3084///
3085/// \headerfile <immintrin.h>
3086///
3087/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3088///
3089/// \param __X
3090/// A 128-bit integer vector whose low byte will be broadcast.
3091/// \returns A 256-bit integer vector containing the result.
3092static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3094 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3095}
3096
3097/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3098/// to all elements of the result's 256-bit vector of [16 x i16].
3099///
3100/// \headerfile <immintrin.h>
3101///
3102/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3103///
3104/// \param __X
3105/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3106/// \returns A 256-bit vector of [16 x i16] containing the result.
3107static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3109 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3110}
3111
3112/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3113/// to all elements of the result's 256-bit vector of [8 x i32].
3114///
3115/// \headerfile <immintrin.h>
3116///
3117/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3118///
3119/// \param __X
3120/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3121/// \returns A 256-bit vector of [8 x i32] containing the result.
3122static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3124 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3125}
3126
3127/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3128/// to all elements of the result's 256-bit vector of [4 x i64].
3129///
3130/// \headerfile <immintrin.h>
3131///
3132/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3133///
3134/// \param __X
3135/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3136/// \returns A 256-bit vector of [4 x i64] containing the result.
3137static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3139 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3140}
3141
3142/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3143/// bytes of the 128-bit result.
3144///
3145/// \headerfile <immintrin.h>
3146///
3147/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3148///
3149/// \param __X
3150/// A 128-bit integer vector whose low byte will be broadcast.
3151/// \returns A 128-bit integer vector containing the result.
3152static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3154 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3155}
3156
3157/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3158/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3159///
3160/// \headerfile <immintrin.h>
3161///
3162/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3163///
3164/// \param __X
3165/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3166/// \returns A 128-bit vector of [8 x i16] containing the result.
3167static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3169 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3170}
3171
3172/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3173/// to all elements of the result's vector of [4 x i32].
3174///
3175/// \headerfile <immintrin.h>
3176///
3177/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3178///
3179/// \param __X
3180/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3181/// \returns A 128-bit vector of [4 x i32] containing the result.
3182static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3184 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3185}
3186
3187/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3188/// to both elements of the result's 128-bit vector of [2 x i64].
3189///
3190/// \headerfile <immintrin.h>
3191///
3192/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3193///
3194/// \param __X
3195/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3196/// \returns A 128-bit vector of [2 x i64] containing the result.
3197static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3199 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3200}
3201
3202/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3203/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3204/// elements of the 256-bit vector of [8 x i32] in \a __b.
3205///
3206/// \code{.operation}
3207/// FOR i := 0 TO 7
3208/// j := i*32
3209/// k := __b[j+2:j] * 32
3210/// result[j+31:j] := __a[k+31:k]
3211/// ENDFOR
3212/// \endcode
3213///
3214/// \headerfile <immintrin.h>
3215///
3216/// This intrinsic corresponds to the \c VPERMD instruction.
3217///
3218/// \param __a
3219/// A 256-bit vector of [8 x i32] containing the source values.
3220/// \param __b
3221/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3222/// \a __a.
3223/// \returns A 256-bit vector of [8 x i32] containing the result.
3224static __inline__ __m256i __DEFAULT_FN_ATTRS256
3226{
3227 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3228}
3229
/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
///    the 256-bit vector of [4 x double] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3259
3260/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3261/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3262/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3263///
3264/// \code{.operation}
3265/// FOR i := 0 TO 7
3266/// j := i*32
3267/// k := __b[j+2:j] * 32
3268/// result[j+31:j] := __a[k+31:k]
3269/// ENDFOR
3270/// \endcode
3271///
3272/// \headerfile <immintrin.h>
3273///
3274/// This intrinsic corresponds to the \c VPERMPS instruction.
3275///
3276/// \param __a
3277/// A 256-bit vector of [8 x float] containing the source values.
3278/// \param __b
3279/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3280/// \a __a.
3281/// \returns A 256-bit vector of [8 x float] containing the result.
3282static __inline__ __m256 __DEFAULT_FN_ATTRS256
3284{
3285 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3286}
3287
/// Sets the result's 256-bit vector of [4 x i64] to copies of elements of the
///    256-bit vector of [4 x i64] in \a V as specified by the immediate
///    value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMQ instruction.
///
/// \param V
///    A 256-bit vector of [4 x i64] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3317
/// Sets each half of the 256-bit result either to zero or to one of the
///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
///    as specified by the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value specifying how to form the result. Bits [3:0]
///    control the lower half of the result, bits [7:4] control the upper half.
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
///    otherwise bits [1:0] determine the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3363
/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
///    otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3383
/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
///    result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
///    is zero, overwrites the lower half of the result; otherwise,
///    overwrites the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                         (__v2di)(__m128i)(V2), (int)(M)))
3407
3408/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3409/// the most significant bit of the corresponding element in the mask
3410/// \a __M is set; otherwise, sets that element of the result to zero.
3411/// Returns the 256-bit [8 x i32] result.
3412///
3413/// \code{.operation}
3414/// FOR i := 0 TO 7
3415/// j := i*32
3416/// IF __M[j+31] == 1
3417/// result[j+31:j] := Load32(__X+(i*4))
3418/// ELSE
3419/// result[j+31:j] := 0
3420/// FI
3421/// ENDFOR
3422/// \endcode
3423///
3424/// \headerfile <immintrin.h>
3425///
3426/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3427///
3428/// \param __X
3429/// A pointer to the memory used for loading values.
3430/// \param __M
3431/// A 256-bit vector of [8 x i32] containing the mask bits.
3432/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3433/// elements.
3434static __inline__ __m256i __DEFAULT_FN_ATTRS256
3435_mm256_maskload_epi32(int const *__X, __m256i __M)
3436{
3437 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3438}
3439
3440/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3441/// the most significant bit of the corresponding element in the mask
3442/// \a __M is set; otherwise, sets that element of the result to zero.
3443/// Returns the 256-bit [4 x i64] result.
3444///
3445/// \code{.operation}
3446/// FOR i := 0 TO 3
3447/// j := i*64
3448/// IF __M[j+63] == 1
3449/// result[j+63:j] := Load64(__X+(i*8))
3450/// ELSE
3451/// result[j+63:j] := 0
3452/// FI
3453/// ENDFOR
3454/// \endcode
3455///
3456/// \headerfile <immintrin.h>
3457///
3458/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3459///
3460/// \param __X
3461/// A pointer to the memory used for loading values.
3462/// \param __M
3463/// A 256-bit vector of [4 x i64] containing the mask bits.
3464/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3465/// elements.
3466static __inline__ __m256i __DEFAULT_FN_ATTRS256
3467_mm256_maskload_epi64(long long const *__X, __m256i __M)
3468{
3469 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3470}
3471
3472/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3473/// the most significant bit of the corresponding element in the mask
3474/// \a __M is set; otherwise, sets that element of the result to zero.
3475/// Returns the 128-bit [4 x i32] result.
3476///
3477/// \code{.operation}
3478/// FOR i := 0 TO 3
3479/// j := i*32
3480/// IF __M[j+31] == 1
3481/// result[j+31:j] := Load32(__X+(i*4))
3482/// ELSE
3483/// result[j+31:j] := 0
3484/// FI
3485/// ENDFOR
3486/// \endcode
3487///
3488/// \headerfile <immintrin.h>
3489///
3490/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3491///
3492/// \param __X
3493/// A pointer to the memory used for loading values.
3494/// \param __M
3495/// A 128-bit vector of [4 x i32] containing the mask bits.
3496/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3497/// elements.
3498static __inline__ __m128i __DEFAULT_FN_ATTRS128
3499_mm_maskload_epi32(int const *__X, __m128i __M)
3500{
3501 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3502}
3503
3504/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3505/// the most significant bit of the corresponding element in the mask
3506/// \a __M is set; otherwise, sets that element of the result to zero.
3507/// Returns the 128-bit [2 x i64] result.
3508///
3509/// \code{.operation}
3510/// FOR i := 0 TO 1
3511/// j := i*64
3512/// IF __M[j+63] == 1
3513/// result[j+63:j] := Load64(__X+(i*8))
3514/// ELSE
3515/// result[j+63:j] := 0
3516/// FI
3517/// ENDFOR
3518/// \endcode
3519///
3520/// \headerfile <immintrin.h>
3521///
3522/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3523///
3524/// \param __X
3525/// A pointer to the memory used for loading values.
3526/// \param __M
3527/// A 128-bit vector of [2 x i64] containing the mask bits.
3528/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3529/// elements.
3530static __inline__ __m128i __DEFAULT_FN_ATTRS128
3531_mm_maskload_epi64(long long const *__X, __m128i __M)
3532{
3533 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3534}
3535
3536/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3537/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3538/// the corresponding element in the mask \a __M is set; otherwise, the
3539/// memory element is unchanged.
3540///
3541/// \code{.operation}
3542/// FOR i := 0 TO 7
3543/// j := i*32
3544/// IF __M[j+31] == 1
3545/// Store32(__X+(i*4), __Y[j+31:j])
3546/// FI
3547/// ENDFOR
3548/// \endcode
3549///
3550/// \headerfile <immintrin.h>
3551///
3552/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3553///
3554/// \param __X
3555/// A pointer to the memory used for storing values.
3556/// \param __M
3557/// A 256-bit vector of [8 x i32] containing the mask bits.
3558/// \param __Y
3559/// A 256-bit vector of [8 x i32] containing the values to store.
3560static __inline__ void __DEFAULT_FN_ATTRS256
3561_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3562{
3563 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3564}
3565
3566/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3567/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3568/// the corresponding element in the mask \a __M is set; otherwise, the
3569/// memory element is unchanged.
3570///
3571/// \code{.operation}
3572/// FOR i := 0 TO 3
3573/// j := i*64
3574/// IF __M[j+63] == 1
3575/// Store64(__X+(i*8), __Y[j+63:j])
3576/// FI
3577/// ENDFOR
3578/// \endcode
3579///
3580/// \headerfile <immintrin.h>
3581///
3582/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3583///
3584/// \param __X
3585/// A pointer to the memory used for storing values.
3586/// \param __M
3587/// A 256-bit vector of [4 x i64] containing the mask bits.
3588/// \param __Y
3589/// A 256-bit vector of [4 x i64] containing the values to store.
3590static __inline__ void __DEFAULT_FN_ATTRS256
3591_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3592{
3593 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3594}
3595
3596/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3597/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3598/// the corresponding element in the mask \a __M is set; otherwise, the
3599/// memory element is unchanged.
3600///
3601/// \code{.operation}
3602/// FOR i := 0 TO 3
3603/// j := i*32
3604/// IF __M[j+31] == 1
3605/// Store32(__X+(i*4), __Y[j+31:j])
3606/// FI
3607/// ENDFOR
3608/// \endcode
3609///
3610/// \headerfile <immintrin.h>
3611///
3612/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3613///
3614/// \param __X
3615/// A pointer to the memory used for storing values.
3616/// \param __M
3617/// A 128-bit vector of [4 x i32] containing the mask bits.
3618/// \param __Y
3619/// A 128-bit vector of [4 x i32] containing the values to store.
3620static __inline__ void __DEFAULT_FN_ATTRS128
3621_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3622{
3623 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3624}
3625
3626/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3627/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3628/// the corresponding element in the mask \a __M is set; otherwise, the
3629/// memory element is unchanged.
3630///
3631/// \code{.operation}
3632/// FOR i := 0 TO 1
3633/// j := i*64
3634/// IF __M[j+63] == 1
3635/// Store64(__X+(i*8), __Y[j+63:j])
3636/// FI
3637/// ENDFOR
3638/// \endcode
3639///
3640/// \headerfile <immintrin.h>
3641///
3642/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3643///
3644/// \param __X
3645/// A pointer to the memory used for storing values.
3646/// \param __M
3647/// A 128-bit vector of [2 x i64] containing the mask bits.
3648/// \param __Y
3649/// A 128-bit vector of [2 x i64] containing the values to store.
3650static __inline__ void __DEFAULT_FN_ATTRS128
3651_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3652{
3653 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3654}
3655
3656/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3657/// left by the number of bits given in the corresponding element of the
3658/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3659/// returns the result. If the shift count for any element is greater than
3660/// 31, the result for that element is zero.
3661///
3662/// \headerfile <immintrin.h>
3663///
3664/// This intrinsic corresponds to the \c VPSLLVD instruction.
3665///
3666/// \param __X
3667/// A 256-bit vector of [8 x i32] to be shifted.
3668/// \param __Y
3669/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3670/// bits).
3671/// \returns A 256-bit vector of [8 x i32] containing the result.
3672static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3673_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3674{
3675 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3676}
3677
3678/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3679/// left by the number of bits given in the corresponding element of the
3680/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3681/// returns the result. If the shift count for any element is greater than
3682/// 31, the result for that element is zero.
3683///
3684/// \headerfile <immintrin.h>
3685///
3686/// This intrinsic corresponds to the \c VPSLLVD instruction.
3687///
3688/// \param __X
3689/// A 128-bit vector of [4 x i32] to be shifted.
3690/// \param __Y
3691/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3692/// bits).
3693/// \returns A 128-bit vector of [4 x i32] containing the result.
3694static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3695_mm_sllv_epi32(__m128i __X, __m128i __Y)
3696{
3697 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3698}
3699
3700/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3701/// left by the number of bits given in the corresponding element of the
3702/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3703/// returns the result. If the shift count for any element is greater than
3704/// 63, the result for that element is zero.
3705///
3706/// \headerfile <immintrin.h>
3707///
3708/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3709///
3710/// \param __X
3711/// A 256-bit vector of [4 x i64] to be shifted.
3712/// \param __Y
3713/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3714/// bits).
3715/// \returns A 256-bit vector of [4 x i64] containing the result.
3716static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3717_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3718{
3719 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3720}
3721
3722/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3723/// left by the number of bits given in the corresponding element of the
3724/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3725/// returns the result. If the shift count for any element is greater than
3726/// 63, the result for that element is zero.
3727///
3728/// \headerfile <immintrin.h>
3729///
3730/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3731///
3732/// \param __X
3733/// A 128-bit vector of [2 x i64] to be shifted.
3734/// \param __Y
3735/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3736/// bits).
3737/// \returns A 128-bit vector of [2 x i64] containing the result.
3738static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3739_mm_sllv_epi64(__m128i __X, __m128i __Y)
3740{
3741 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3742}
3743
3744/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3745/// right by the number of bits given in the corresponding element of the
3746/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3747/// returns the result. If the shift count for any element is greater than
3748/// 31, the result for that element is 0 or -1 according to the sign bit
3749/// for that element.
3750///
3751/// \headerfile <immintrin.h>
3752///
3753/// This intrinsic corresponds to the \c VPSRAVD instruction.
3754///
3755/// \param __X
3756/// A 256-bit vector of [8 x i32] to be shifted.
3757/// \param __Y
3758/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3759/// bits).
3760/// \returns A 256-bit vector of [8 x i32] containing the result.
3761static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3762_mm256_srav_epi32(__m256i __X, __m256i __Y)
3763{
3764 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3765}
3766
3767/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3768/// right by the number of bits given in the corresponding element of the
3769/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3770/// returns the result. If the shift count for any element is greater than
3771/// 31, the result for that element is 0 or -1 according to the sign bit
3772/// for that element.
3773///
3774/// \headerfile <immintrin.h>
3775///
3776/// This intrinsic corresponds to the \c VPSRAVD instruction.
3777///
3778/// \param __X
3779/// A 128-bit vector of [4 x i32] to be shifted.
3780/// \param __Y
3781/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3782/// bits).
3783/// \returns A 128-bit vector of [4 x i32] containing the result.
3784static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3785_mm_srav_epi32(__m128i __X, __m128i __Y)
3786{
3787 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3788}
3789
3790/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3791/// right by the number of bits given in the corresponding element of the
3792/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3793/// returns the result. If the shift count for any element is greater than
3794/// 31, the result for that element is zero.
3795///
3796/// \headerfile <immintrin.h>
3797///
3798/// This intrinsic corresponds to the \c VPSRLVD instruction.
3799///
3800/// \param __X
3801/// A 256-bit vector of [8 x i32] to be shifted.
3802/// \param __Y
3803/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3804/// bits).
3805/// \returns A 256-bit vector of [8 x i32] containing the result.
3806static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3807_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3808{
3809 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3810}
3811
3812/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3813/// right by the number of bits given in the corresponding element of the
3814/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3815/// returns the result. If the shift count for any element is greater than
3816/// 31, the result for that element is zero.
3817///
3818/// \headerfile <immintrin.h>
3819///
3820/// This intrinsic corresponds to the \c VPSRLVD instruction.
3821///
3822/// \param __X
3823/// A 128-bit vector of [4 x i32] to be shifted.
3824/// \param __Y
3825/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3826/// bits).
3827/// \returns A 128-bit vector of [4 x i32] containing the result.
3828static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3829_mm_srlv_epi32(__m128i __X, __m128i __Y)
3830{
3831 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3832}
3833
3834/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3835/// right by the number of bits given in the corresponding element of the
3836/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3837/// returns the result. If the shift count for any element is greater than
3838/// 63, the result for that element is zero.
3839///
3840/// \headerfile <immintrin.h>
3841///
3842/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3843///
3844/// \param __X
3845/// A 256-bit vector of [4 x i64] to be shifted.
3846/// \param __Y
3847/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3848/// bits).
3849/// \returns A 256-bit vector of [4 x i64] containing the result.
3850static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3851_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3852{
3853 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3854}
3855
3856/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3857/// right by the number of bits given in the corresponding element of the
3858/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3859/// returns the result. If the shift count for any element is greater than
3860/// 63, the result for that element is zero.
3861///
3862/// \headerfile <immintrin.h>
3863///
3864/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3865///
3866/// \param __X
3867/// A 128-bit vector of [2 x i64] to be shifted.
3868/// \param __Y
3869/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3870/// bits).
3871/// \returns A 128-bit vector of [2 x i64] containing the result.
3872static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3873_mm_srlv_epi64(__m128i __X, __m128i __Y)
3874{
3875 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3876}
3877
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
3926
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
3974
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
4022
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
4070
/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))
4118
/// Performs a masked gather of eight 32-bit floating-point values. Each
///    element of the result is either copied from the 256-bit vector of
///    [8 x float] in \a a or loaded from memory \a m at the scaled index held
///    in the 256-bit vector of [8 x i32] in \a i; the 256-bit vector of
///    [8 x float] in \a mask selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] that supplies the result element
///    wherever the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i), \
      (__v8sf)(__m256)(mask), (s)))
4166
/// Performs a masked gather of two 32-bit floating-point values. Each of the
///    lower two elements of the result is either copied from the 128-bit
///    vector of [4 x float] in \a a or loaded from memory \a m at the scaled
///    index held in the 128-bit vector of [2 x i64] in \a i; the 128-bit
///    vector of [4 x float] in \a mask selects the source for those two
///    elements. The upper two elements of the result are set to zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] that supplies the result element
///    wherever the corresponding mask bit is zero. Only the first two
///    elements are used.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory. Only the first two elements are used.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4217
/// Performs a masked gather of four 32-bit floating-point values. Each
///    element of the result is either copied from the 128-bit vector of
///    [4 x float] in \a a or loaded from memory \a m at the scaled index held
///    in the 256-bit vector of [4 x i64] in \a i; the 128-bit vector of
///    [4 x float] in \a mask selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] that supplies the result element
///    wherever the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4265
/// Performs a masked gather of four 32-bit integer values. Each element of
///    the result is either copied from the 128-bit vector of [4 x i32] in
///    \a a or loaded from memory \a m at the scaled index held in the 128-bit
///    vector of [4 x i32] in \a i; the 128-bit vector of [4 x i32] in \a mask
///    selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4si)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4313
/// Performs a masked gather of eight 32-bit integer values. Each element of
///    the result is either copied from the 256-bit vector of [8 x i32] in
///    \a a or loaded from memory \a m at the scaled index held in the 256-bit
///    vector of [8 x i32] in \a i; the 256-bit vector of [8 x i32] in \a mask
///    selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i), \
      (__v8si)(__m256i)(mask), (s)))
4361
/// Conditionally gathers two 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory. Only the first two elements
///    are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
4412
/// Performs a masked gather of four 32-bit integer values. Each element of
///    the result is either copied from the 128-bit vector of [4 x i32] in
///    \a a or loaded from memory \a m at the scaled index held in the 256-bit
///    vector of [4 x i64] in \a i; the 128-bit vector of [4 x i32] in \a mask
///    selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4460
/// Performs a masked gather of two 64-bit integer values. Each element of the
///    result is either copied from the 128-bit vector of [2 x i64] in \a a or
///    loaded from memory \a m at the scaled index held in the 128-bit vector
///    of [4 x i32] in \a i; the 128-bit vector of [2 x i64] in \a mask selects
///    the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4509
/// Performs a masked gather of four 64-bit integer values. Each element of
///    the result is either copied from the 256-bit vector of [4 x i64] in
///    \a a or loaded from memory \a m at the scaled index held in the 128-bit
///    vector of [4 x i32] in \a i; the 256-bit vector of [4 x i64] in \a mask
///    selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4557
/// Performs a masked gather of two 64-bit integer values. Each element of the
///    result is either copied from the 128-bit vector of [2 x i64] in \a a or
///    loaded from memory \a m at the scaled index held in the 128-bit vector
///    of [2 x i64] in \a i; the 128-bit vector of [2 x i64] in \a mask selects
///    the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v2di)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4605
/// Performs a masked gather of four 64-bit integer values. Each element of
///    the result is either copied from the 256-bit vector of [4 x i64] in
///    \a a or loaded from memory \a m at the scaled index held in the 256-bit
///    vector of [4 x i64] in \a i; the 256-bit vector of [4 x i64] in \a mask
///    selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] holding the mask. The mask bit of an
///    element is its most significant bit. Where the mask bit is zero, the
///    corresponding element of \a a is returned; otherwise the element is
///    loaded from memory.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4di)(__m256i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4653
/// Loads two 64-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [4 x i32] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      (__v2df)_mm_undefined_pd(), (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
4689
/// Loads four 64-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [4 x i32] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4725
/// Loads two 64-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [2 x i64] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)_mm_undefined_pd(), (double const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
4760
/// Loads four 64-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 256-bit vector of [4 x i64] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4796
/// Loads four 32-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [4 x i32] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4831
/// Loads eight 32-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 256-bit vector of [8 x i32] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)_mm256_undefined_ps(), (float const *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))
4867
/// Loads two 32-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [2 x i64] in
///    \a i. The upper two elements of the result are set to zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param s
///    The scale factor applied to the indexes in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4904
/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                             _mm_setzero_ps()), \
                                        (s)))
4939
4940/// Gathers four 32-bit integer values from memory \a m using scaled
4941/// indexes from the 128-bit vector of [4 x i32] in \a i.
4942///
4943/// \code{.operation}
4944/// FOR element := 0 to 3
4945/// j := element*32
4946/// k := element*32
4947/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4948/// ENDFOR
4949/// \endcode
4950///
4951/// \headerfile <immintrin.h>
4952///
4953/// \code
4954/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
4955/// \endcode
4956///
4957/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4958///
4959/// \param m
4960/// A pointer to the memory used for loading values.
4961/// \param i
4962/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4963/// \param s
4964/// A literal constant scale factor for the indexes in \a i. Must be
4965/// 1, 2, 4, or 8.
4966/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4967#define _mm_i32gather_epi32(m, i, s) \
4968 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
4969 (int const *)(m), (__v4si)(__m128i)(i), \
4970 (__v4si)_mm_set1_epi32(-1), (s)))
4971
4972/// Gathers eight 32-bit integer values from memory \a m using scaled
4973/// indexes from the 256-bit vector of [8 x i32] in \a i.
4974///
4975/// \code{.operation}
4976/// FOR element := 0 to 7
4977/// j := element*32
4978/// k := element*32
4979/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4980/// ENDFOR
4981/// \endcode
4982///
4983/// \headerfile <immintrin.h>
4984///
4985/// \code
4986/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
4987/// \endcode
4988///
4989/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4990///
4991/// \param m
4992/// A pointer to the memory used for loading values.
4993/// \param i
4994/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4995/// \param s
4996/// A literal constant scale factor for the indexes in \a i. Must be
4997/// 1, 2, 4, or 8.
4998/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4999#define _mm256_i32gather_epi32(m, i, s) \
5000 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5001 (int const *)(m), (__v8si)(__m256i)(i), \
5002 (__v8si)_mm256_set1_epi32(-1), (s)))
5003
5004/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5005/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5006/// of the result are zeroed.
5007///
5008/// \code{.operation}
5009/// FOR element := 0 to 1
5010/// j := element*32
5011/// k := element*64
5012/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5013/// ENDFOR
5014/// result[127:64] := 0
5015/// \endcode
5016///
5017/// \headerfile <immintrin.h>
5018///
5019/// \code
5020/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5021/// \endcode
5022///
5023/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5024///
5025/// \param m
5026/// A pointer to the memory used for loading values.
5027/// \param i
5028/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5029/// \param s
5030/// A literal constant scale factor for the indexes in \a i. Must be
5031/// 1, 2, 4, or 8.
5032/// \returns A 128-bit vector of [4 x i32]; the lower two elements hold the gathered values and the upper two are zero.
5033#define _mm_i64gather_epi32(m, i, s) \
5034 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5035 (int const *)(m), (__v2di)(__m128i)(i), \
5036 (__v4si)_mm_set1_epi32(-1), (s)))
5037
5038/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5039/// from the 256-bit vector of [4 x i64] in \a i.
5040///
5041/// \code{.operation}
5042/// FOR element := 0 to 3
5043/// j := element*32
5044/// k := element*64
5045/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5046/// ENDFOR
5047/// \endcode
5048///
5049/// \headerfile <immintrin.h>
5050///
5051/// \code
5052/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5053/// \endcode
5054///
5055/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5056///
5057/// \param m
5058/// A pointer to the memory used for loading values. The macro casts it to <c>int const *</c>.
5059/// \param i
5060/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5061/// \param s
5062/// A literal constant scale factor for the indexes in \a i. Must be
5063/// 1, 2, 4, or 8.
5064/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5065#define _mm256_i64gather_epi32(m, i, s) \
5066 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5067 (int const *)(m), (__v4di)(__m256i)(i), \
5068 (__v4si)_mm_set1_epi32(-1), (s)))
5069
5070/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5071/// from the 128-bit vector of [4 x i32] in \a i.
5072///
5073/// \code{.operation}
5074/// FOR element := 0 to 1
5075/// j := element*64
5076/// k := element*32
5077/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5078/// ENDFOR
5079/// \endcode
5080///
5081/// \headerfile <immintrin.h>
5082///
5083/// \code
5084/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5085/// \endcode
5086///
5087/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5088///
5089/// \param m
5090/// A pointer to the memory used for loading values. The macro casts it to <c>long long const *</c>.
5091/// \param i
5092/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5093/// the first two elements are used.
5094/// \param s
5095/// A literal constant scale factor for the indexes in \a i. Must be
5096/// 1, 2, 4, or 8.
5097/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5098#define _mm_i32gather_epi64(m, i, s) \
5099 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5100 (long long const *)(m), \
5101 (__v4si)(__m128i)(i), \
5102 (__v2di)_mm_set1_epi64x(-1), (s)))
5103
5104/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5105/// from the 128-bit vector of [4 x i32] in \a i.
5106///
5107/// \code{.operation}
5108/// FOR element := 0 to 3
5109/// j := element*64
5110/// k := element*32
5111/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5112/// ENDFOR
5113/// \endcode
5114///
5115/// \headerfile <immintrin.h>
5116///
5117/// \code
5118/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5119/// \endcode
5120///
5121/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5122///
5123/// \param m
5124/// A pointer to the memory used for loading values. The macro casts it to <c>long long const *</c>.
5125/// \param i
5126/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5127/// \param s
5128/// A literal constant scale factor for the indexes in \a i. Must be
5129/// 1, 2, 4, or 8.
5130/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5131#define _mm256_i32gather_epi64(m, i, s) \
5132 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5133 (long long const *)(m), \
5134 (__v4si)(__m128i)(i), \
5135 (__v4di)_mm256_set1_epi64x(-1), (s)))
5136
5137/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5138/// from the 128-bit vector of [2 x i64] in \a i.
5139///
5140/// \code{.operation}
5141/// FOR element := 0 to 1
5142/// j := element*64
5143/// k := element*64
5144/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5145/// ENDFOR
5146/// \endcode
5147///
5148/// \headerfile <immintrin.h>
5149///
5150/// \code
5151/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5152/// \endcode
5153///
5154/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5155///
5156/// \param m
5157/// A pointer to the memory used for loading values. The macro casts it to <c>long long const *</c>.
5158/// \param i
5159/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5160/// \param s
5161/// A literal constant scale factor for the indexes in \a i. Must be
5162/// 1, 2, 4, or 8.
5163/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5164#define _mm_i64gather_epi64(m, i, s) \
5165 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5166 (long long const *)(m), \
5167 (__v2di)(__m128i)(i), \
5168 (__v2di)_mm_set1_epi64x(-1), (s)))
5169
5170/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5171/// from the 256-bit vector of [4 x i64] in \a i.
5172///
5173/// \code{.operation}
5174/// FOR element := 0 to 3
5175/// j := element*64
5176/// k := element*64
5177/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5178/// ENDFOR
5179/// \endcode
5180///
5181/// \headerfile <immintrin.h>
5182///
5183/// \code
5184/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5185/// \endcode
5186///
5187/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5188///
5189/// \param m
5190/// A pointer to the memory used for loading values. The macro casts it to <c>long long const *</c>.
5191/// \param i
5192/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5193/// \param s
5194/// A literal constant scale factor for the indexes in \a i. Must be
5195/// 1, 2, 4, or 8.
5196/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5197#define _mm256_i64gather_epi64(m, i, s) \
5198 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5199 (long long const *)(m), \
5200 (__v4di)(__m256i)(i), \
5201 (__v4di)_mm256_set1_epi64x(-1), (s)))
5202
5203#undef __DEFAULT_FN_ATTRS256
5204#undef __DEFAULT_FN_ATTRS128
5205#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
5206#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5207
5208#endif /* __AVX2INTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
Definition altivec.h:578
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi32_epi64(__m128i __V)
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(const void *__V)
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsubs_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than an...
Definition avx2intrin.h:722
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epu32(__m256i __a, __m256i __b)
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srav_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_andnot_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit int...
Definition avx2intrin.h:466
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation...
Definition avx2intrin.h:386
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maddubs_epi16(__m256i __a, __m256i __b)
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed by...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M)
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to mem...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epu16(__m256i __a, __m256i __b)
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upp...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256...
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastsd_pd(__m128d __a)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M)
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi16(__m128i __V)
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsi...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packus_epi32(__m256i __V1, __m256i __V2)
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation,...
Definition avx2intrin.h:261
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi32(__m256i __a)
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a a...
Definition avx2intrin.h:139
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srav_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi64(__m128i __V)
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and r...
Definition avx2intrin.h:670
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed sa...
Definition avx2intrin.h:368
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits spec...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memor...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu32_epi64(__m128i __V)
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi32(__m128i __V)
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x f...
static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8(__m256i __a)
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vecto...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the ...
Definition avx2intrin.h:551
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and r...
Definition avx2intrin.h:696
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M)
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and r...
Definition avx2intrin.h:938
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a, __m256i __b)
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greate...
Definition avx2intrin.h:750
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi32(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and re...
Definition avx2intrin.h:969
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 b...
Definition avx2intrin.h:279
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi32(__m128i __V)
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result'...
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi32(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and ret...
Definition avx2intrin.h:869
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater...
Definition avx2intrin.h:776
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi32(__m256i __a, __m256i __b)
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned satur...
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using sign...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns t...
Definition avx2intrin.h:618
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi8(__m256i __a)
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each va...
Definition avx2intrin.h:107
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi64(__m128i __V)
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi64(__m128i __V)
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadds_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using ...
Definition avx2intrin.h:903
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi64(__m256i __a, __m256i __b)
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the ...
Definition avx2intrin.h:333
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi64(__m128i __V)
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_and_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
Definition avx2intrin.h:448
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsi128_si256(__m128i __X)
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi32(__m256i __a, __m256i __b)
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the ...
Definition avx2intrin.h:315
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packus_epi16(__m256i __a, __m256i __b)
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation,...
Definition avx2intrin.h:230
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m256i __a)
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a ...
Definition avx2intrin.h:123
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the...
Definition avx2intrin.h:297
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_or_si256(__m256i __a, __m256i __b)
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsd_pd(__m128d __X)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi32(__m128i __V)
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and re...
Definition avx2intrin.h:838
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu16(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16...
Definition avx2intrin.h:517
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_xor_si256(__m256i __a, __m256i __b)
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned ...
Definition avx2intrin.h:403
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M)
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the c...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in _...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi64(__m256i __a, __m256i __b)
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_madd_epi16(__m256i __a, __m256i __b)
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit interme...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturat...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packs_epi32(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation, and returns the resulting 256-bit vector of [16 x i16].
Definition avx2intrin.h:200
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater...
Definition avx2intrin.h:802
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b)
Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers fr...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each sum in the corresponding byte of the 256-bit result.
Definition avx2intrin.h:351
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits, shifting in zero bits, and returns the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and ...
Definition avx2intrin.h:644
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packs_epi16(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation, and returns the 256-bit result.
Definition avx2intrin.h:169
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu8(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a a...
Definition avx2intrin.h:492
static __inline__ void int __a
Definition emmintrin.h:4077
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19