clang 22.0.0git
avx2intrin.h
Go to the documentation of this file.
/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __IMMINTRIN_H
11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX2INTRIN_H
15#define __AVX2INTRIN_H
16
17/* Define the default attributes for the functions in this file. */
18#define __DEFAULT_FN_ATTRS256 \
19 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
20 __min_vector_width__(256)))
21#define __DEFAULT_FN_ATTRS128 \
22 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
23 __min_vector_width__(128)))
24
25#if defined(__cplusplus) && (__cplusplus >= 201103L)
26#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
27#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
28#else
29#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
30#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
31#endif
32
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
///    \a Y.
///
///    Eight SAD results are computed using the lower half of the input
///    vectors, and another eight using the upper half. These 16-bit values
///    are returned in the lower and upper halves of the 256-bit result,
///    respectively.
///
///    A single SAD operation selects four bytes from \a X and four bytes from
///    \a Y as input. It computes the differences between each \a X byte and
///    the corresponding \a Y byte, takes the absolute value of each
///    difference, and sums these four values to form one 16-bit result. The
///    intrinsic computes 16 of these results with different sets of input
///    bytes.
///
///    For each set of eight results, the SAD operations use the same four
///    bytes from \a Y; the starting bit position for these four bytes is
///    specified by \a M[1:0] times 32 for the lower half and by \a M[4:3]
///    times 32 for the upper half. The eight operations use successive
///    sets of four bytes from \a X; the starting bit position for the first
///    set of four bytes is specified by \a M[2] times 32 for the lower half
///    and by \a M[5] times 32 for the upper half. These bit positions
///    are all relative to the 128-bit lane for each set of eight operations.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///    An unsigned immediate value specifying the starting positions of the
///    bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X),                   \
                                      (__v32qi)(__m256i)(Y), (int)(M)))

95/// Computes the absolute value of each signed byte in the 256-bit integer
96/// vector \a __a and returns each value in the corresponding byte of
97/// the result.
98///
99/// \headerfile <immintrin.h>
100///
101/// This intrinsic corresponds to the \c VPABSB instruction.
102///
103/// \param __a
104/// A 256-bit integer vector.
105/// \returns A 256-bit integer vector containing the result.
106static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
108 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
109}
110
111/// Computes the absolute value of each signed 16-bit element in the 256-bit
112/// vector of [16 x i16] in \a __a and returns each value in the
113/// corresponding element of the result.
114///
115/// \headerfile <immintrin.h>
116///
117/// This intrinsic corresponds to the \c VPABSW instruction.
118///
119/// \param __a
120/// A 256-bit vector of [16 x i16].
121/// \returns A 256-bit vector of [16 x i16] containing the result.
122static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
124 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
125}
126
127/// Computes the absolute value of each signed 32-bit element in the 256-bit
128/// vector of [8 x i32] in \a __a and returns each value in the
129/// corresponding element of the result.
130///
131/// \headerfile <immintrin.h>
132///
133/// This intrinsic corresponds to the \c VPABSD instruction.
134///
135/// \param __a
136/// A 256-bit vector of [8 x i32].
137/// \returns A 256-bit vector of [8 x i32] containing the result.
138static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
140 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
141}
142
143/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
144/// integers using signed saturation, and returns the 256-bit result.
145///
146/// \code{.operation}
147/// FOR i := 0 TO 7
148/// j := i*16
149/// k := i*8
150/// result[7+k:k] := SATURATE8(__a[15+j:j])
151/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
152/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
153/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
154/// ENDFOR
155/// \endcode
156///
157/// \headerfile <immintrin.h>
158///
159/// This intrinsic corresponds to the \c VPACKSSWB instruction.
160///
161/// \param __a
162/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
163/// result[191:128].
164/// \param __b
165/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
166/// result[255:192].
167/// \returns A 256-bit integer vector containing the result.
168static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
169_mm256_packs_epi16(__m256i __a, __m256i __b) {
170 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
171}
172
173/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
174/// integers using signed saturation, and returns the resulting 256-bit
175/// vector of [16 x i16].
176///
177/// \code{.operation}
178/// FOR i := 0 TO 3
179/// j := i*32
180/// k := i*16
181/// result[15+k:k] := SATURATE16(__a[31+j:j])
182/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
183/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
184/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
185/// ENDFOR
186/// \endcode
187///
188/// \headerfile <immintrin.h>
189///
190/// This intrinsic corresponds to the \c VPACKSSDW instruction.
191///
192/// \param __a
193/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
194/// result[191:128].
195/// \param __b
196/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
197/// result[255:192].
198/// \returns A 256-bit vector of [16 x i16] containing the result.
199static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
200_mm256_packs_epi32(__m256i __a, __m256i __b) {
201 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
202}
203
204/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
205/// using unsigned saturation, and returns the 256-bit result.
206///
207/// \code{.operation}
208/// FOR i := 0 TO 7
209/// j := i*16
210/// k := i*8
211/// result[7+k:k] := SATURATE8U(__a[15+j:j])
212/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
213/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
214/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
215/// ENDFOR
216/// \endcode
217///
218/// \headerfile <immintrin.h>
219///
220/// This intrinsic corresponds to the \c VPACKUSWB instruction.
221///
222/// \param __a
223/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
224/// result[191:128].
225/// \param __b
226/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
227/// result[255:192].
228/// \returns A 256-bit integer vector containing the result.
229static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
230_mm256_packus_epi16(__m256i __a, __m256i __b) {
231 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
232}
233
234/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
235/// using unsigned saturation, and returns the resulting 256-bit vector of
236/// [16 x i16].
237///
238/// \code{.operation}
239/// FOR i := 0 TO 3
240/// j := i*32
241/// k := i*16
242/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
243/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
244/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
245/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
246/// ENDFOR
247/// \endcode
248///
249/// \headerfile <immintrin.h>
250///
251/// This intrinsic corresponds to the \c VPACKUSDW instruction.
252///
253/// \param __V1
254/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
255/// result[191:128].
256/// \param __V2
257/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
258/// result[255:192].
259/// \returns A 256-bit vector of [16 x i16] containing the result.
260static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
261_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
262 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
263}
264
265/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
266/// vectors and returns the lower 8 bits of each sum in the corresponding
267/// byte of the 256-bit integer vector result (overflow is ignored).
268///
269/// \headerfile <immintrin.h>
270///
271/// This intrinsic corresponds to the \c VPADDB instruction.
272///
273/// \param __a
274/// A 256-bit integer vector containing one of the source operands.
275/// \param __b
276/// A 256-bit integer vector containing one of the source operands.
277/// \returns A 256-bit integer vector containing the sums.
278static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
279_mm256_add_epi8(__m256i __a, __m256i __b) {
280 return (__m256i)((__v32qu)__a + (__v32qu)__b);
281}
282
283/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284/// [16 x i16] and returns the lower 16 bits of each sum in the
285/// corresponding element of the [16 x i16] result (overflow is ignored).
286///
287/// \headerfile <immintrin.h>
288///
289/// This intrinsic corresponds to the \c VPADDW instruction.
290///
291/// \param __a
292/// A 256-bit vector of [16 x i16] containing one of the source operands.
293/// \param __b
294/// A 256-bit vector of [16 x i16] containing one of the source operands.
295/// \returns A 256-bit vector of [16 x i16] containing the sums.
296static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
297_mm256_add_epi16(__m256i __a, __m256i __b) {
298 return (__m256i)((__v16hu)__a + (__v16hu)__b);
299}
300
301/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
302/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
303/// element of the [8 x i32] result (overflow is ignored).
304///
305/// \headerfile <immintrin.h>
306///
307/// This intrinsic corresponds to the \c VPADDD instruction.
308///
309/// \param __a
310/// A 256-bit vector of [8 x i32] containing one of the source operands.
311/// \param __b
312/// A 256-bit vector of [8 x i32] containing one of the source operands.
313/// \returns A 256-bit vector of [8 x i32] containing the sums.
314static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
315_mm256_add_epi32(__m256i __a, __m256i __b) {
316 return (__m256i)((__v8su)__a + (__v8su)__b);
317}
318
319/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
320/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
321/// element of the [4 x i64] result (overflow is ignored).
322///
323/// \headerfile <immintrin.h>
324///
325/// This intrinsic corresponds to the \c VPADDQ instruction.
326///
327/// \param __a
328/// A 256-bit vector of [4 x i64] containing one of the source operands.
329/// \param __b
330/// A 256-bit vector of [4 x i64] containing one of the source operands.
331/// \returns A 256-bit vector of [4 x i64] containing the sums.
332static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
333_mm256_add_epi64(__m256i __a, __m256i __b) {
334 return (__m256i)((__v4du)__a + (__v4du)__b);
335}
336
337/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
338/// vectors using signed saturation, and returns each sum in the
339/// corresponding byte of the 256-bit integer vector result.
340///
341/// \headerfile <immintrin.h>
342///
343/// This intrinsic corresponds to the \c VPADDSB instruction.
344///
345/// \param __a
346/// A 256-bit integer vector containing one of the source operands.
347/// \param __b
348/// A 256-bit integer vector containing one of the source operands.
349/// \returns A 256-bit integer vector containing the sums.
350static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
351_mm256_adds_epi8(__m256i __a, __m256i __b) {
352 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
353}
354
355/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
356/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
357///
358/// \headerfile <immintrin.h>
359///
360/// This intrinsic corresponds to the \c VPADDSW instruction.
361///
362/// \param __a
363/// A 256-bit vector of [16 x i16] containing one of the source operands.
364/// \param __b
365/// A 256-bit vector of [16 x i16] containing one of the source operands.
366/// \returns A 256-bit vector of [16 x i16] containing the sums.
367static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
368_mm256_adds_epi16(__m256i __a, __m256i __b) {
369 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
370}
371
372/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
373/// vectors using unsigned saturation, and returns each sum in the
374/// corresponding byte of the 256-bit integer vector result.
375///
376/// \headerfile <immintrin.h>
377///
378/// This intrinsic corresponds to the \c VPADDUSB instruction.
379///
380/// \param __a
381/// A 256-bit integer vector containing one of the source operands.
382/// \param __b
383/// A 256-bit integer vector containing one of the source operands.
384/// \returns A 256-bit integer vector containing the sums.
385static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
386_mm256_adds_epu8(__m256i __a, __m256i __b) {
387 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
388}
389
390/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
391/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
392///
393/// \headerfile <immintrin.h>
394///
395/// This intrinsic corresponds to the \c VPADDUSW instruction.
396///
397/// \param __a
398/// A 256-bit vector of [16 x i16] containing one of the source operands.
399/// \param __b
400/// A 256-bit vector of [16 x i16] containing one of the source operands.
401/// \returns A 256-bit vector of [16 x i16] containing the sums.
402static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
403_mm256_adds_epu16(__m256i __a, __m256i __b) {
404 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
405}
406
/// Uses the lower half of the 256-bit vector \a a as the upper half of a
///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
///    as the lower half of the temporary value. Right-shifts the temporary
///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
///    \a b to make another temporary value, right shifts by \a n, and uses
///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
///    result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a),                   \
                                      (__v32qi)(__m256i)(b), (n)))

435/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
436/// \a __b.
437///
438/// \headerfile <immintrin.h>
439///
440/// This intrinsic corresponds to the \c VPAND instruction.
441///
442/// \param __a
443/// A 256-bit integer vector.
444/// \param __b
445/// A 256-bit integer vector.
446/// \returns A 256-bit integer vector containing the result.
447static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
448_mm256_and_si256(__m256i __a, __m256i __b)
449{
450 return (__m256i)((__v4du)__a & (__v4du)__b);
451}
452
453/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
454/// the bitwise NOT of the 256-bit integer vector in \a __a.
455///
456/// \headerfile <immintrin.h>
457///
458/// This intrinsic corresponds to the \c VPANDN instruction.
459///
460/// \param __a
461/// A 256-bit integer vector.
462/// \param __b
463/// A 256-bit integer vector.
464/// \returns A 256-bit integer vector containing the result.
465static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
466_mm256_andnot_si256(__m256i __a, __m256i __b)
467{
468 return (__m256i)(~(__v4du)__a & (__v4du)__b);
469}
470
471/// Computes the averages of the corresponding unsigned bytes in the two
472/// 256-bit integer vectors in \a __a and \a __b and returns each
473/// average in the corresponding byte of the 256-bit result.
474///
475/// \code{.operation}
476/// FOR i := 0 TO 31
477/// j := i*8
478/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
479/// ENDFOR
480/// \endcode
481///
482/// \headerfile <immintrin.h>
483///
484/// This intrinsic corresponds to the \c VPAVGB instruction.
485///
486/// \param __a
487/// A 256-bit integer vector.
488/// \param __b
489/// A 256-bit integer vector.
490/// \returns A 256-bit integer vector containing the result.
491static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
492_mm256_avg_epu8(__m256i __a, __m256i __b) {
493 return (__m256i)__builtin_ia32_pavgb256((__v32qu)__a, (__v32qu)__b);
494}
495
496/// Computes the averages of the corresponding unsigned 16-bit integers in
497/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
498/// each average in the corresponding element of the 256-bit result.
499///
500/// \code{.operation}
501/// FOR i := 0 TO 15
502/// j := i*16
503/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
504/// ENDFOR
505/// \endcode
506///
507/// \headerfile <immintrin.h>
508///
509/// This intrinsic corresponds to the \c VPAVGW instruction.
510///
511/// \param __a
512/// A 256-bit vector of [16 x i16].
513/// \param __b
514/// A 256-bit vector of [16 x i16].
515/// \returns A 256-bit vector of [16 x i16] containing the result.
516static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
517_mm256_avg_epu16(__m256i __a, __m256i __b) {
518 return (__m256i)__builtin_ia32_pavgw256((__v16hu)__a, (__v16hu)__b);
519}
520
521/// Merges 8-bit integer values from either of the two 256-bit vectors
522/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
523/// the resulting 256-bit integer vector.
524///
525/// \code{.operation}
526/// FOR i := 0 TO 31
527/// j := i*8
528/// IF __M[7+i] == 0
529/// result[7+j:j] := __V1[7+j:j]
530/// ELSE
531/// result[7+j:j] := __V2[7+j:j]
532/// FI
533/// ENDFOR
534/// \endcode
535///
536/// \headerfile <immintrin.h>
537///
538/// This intrinsic corresponds to the \c VPBLENDVB instruction.
539///
540/// \param __V1
541/// A 256-bit integer vector containing source values.
542/// \param __V2
543/// A 256-bit integer vector containing source values.
544/// \param __M
545/// A 256-bit integer vector, with bit [7] of each byte specifying the
546/// source for each corresponding byte of the result. When the mask bit
547/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
548/// \a __V2.
549/// \returns A 256-bit integer vector containing the result.
550static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
551_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
552 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
553 (__v32qi)__M);
554}
555
/// Merges 16-bit integer values from either of the two 256-bit vectors
///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
///    and returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1),                  \
                                      (__v16hi)(__m256i)(V2), (int)(M)))

597/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
598/// \a __b for equality and returns the outcomes in the corresponding
599/// bytes of the 256-bit result.
600///
601/// \code{.operation}
602/// FOR i := 0 TO 31
603/// j := i*8
604/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
605/// ENDFOR
606/// \endcode
607///
608/// \headerfile <immintrin.h>
609///
610/// This intrinsic corresponds to the \c VPCMPEQB instruction.
611///
612/// \param __a
613/// A 256-bit integer vector containing one of the inputs.
614/// \param __b
615/// A 256-bit integer vector containing one of the inputs.
616/// \returns A 256-bit integer vector containing the result.
617static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
618_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
619{
620 return (__m256i)((__v32qi)__a == (__v32qi)__b);
621}
622
623/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
624/// \a __a and \a __b for equality and returns the outcomes in the
625/// corresponding elements of the 256-bit result.
626///
627/// \code{.operation}
628/// FOR i := 0 TO 15
629/// j := i*16
630/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
631/// ENDFOR
632/// \endcode
633///
634/// \headerfile <immintrin.h>
635///
636/// This intrinsic corresponds to the \c VPCMPEQW instruction.
637///
638/// \param __a
639/// A 256-bit vector of [16 x i16] containing one of the inputs.
640/// \param __b
641/// A 256-bit vector of [16 x i16] containing one of the inputs.
642/// \returns A 256-bit vector of [16 x i16] containing the result.
643static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
644_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
645{
646 return (__m256i)((__v16hi)__a == (__v16hi)__b);
647}
648
649/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
650/// \a __a and \a __b for equality and returns the outcomes in the
651/// corresponding elements of the 256-bit result.
652///
653/// \code{.operation}
654/// FOR i := 0 TO 7
655/// j := i*32
656/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
657/// ENDFOR
658/// \endcode
659///
660/// \headerfile <immintrin.h>
661///
662/// This intrinsic corresponds to the \c VPCMPEQD instruction.
663///
664/// \param __a
665/// A 256-bit vector of [8 x i32] containing one of the inputs.
666/// \param __b
667/// A 256-bit vector of [8 x i32] containing one of the inputs.
668/// \returns A 256-bit vector of [8 x i32] containing the result.
669static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
670_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
671{
672 return (__m256i)((__v8si)__a == (__v8si)__b);
673}
674
675/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
676/// \a __a and \a __b for equality and returns the outcomes in the
677/// corresponding elements of the 256-bit result.
678///
679/// \code{.operation}
680/// FOR i := 0 TO 3
681/// j := i*64
682/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
683/// ENDFOR
684/// \endcode
685///
686/// \headerfile <immintrin.h>
687///
688/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
689///
690/// \param __a
691/// A 256-bit vector of [4 x i64] containing one of the inputs.
692/// \param __b
693/// A 256-bit vector of [4 x i64] containing one of the inputs.
694/// \returns A 256-bit vector of [4 x i64] containing the result.
695static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
696_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
697{
698 return (__m256i)((__v4di)__a == (__v4di)__b);
699}
700
701/// Compares corresponding signed bytes in the 256-bit integer vectors in
702/// \a __a and \a __b for greater-than and returns the outcomes in the
703/// corresponding bytes of the 256-bit result.
704///
705/// \code{.operation}
706/// FOR i := 0 TO 31
707/// j := i*8
708/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
709/// ENDFOR
710/// \endcode
711///
712/// \headerfile <immintrin.h>
713///
714/// This intrinsic corresponds to the \c VPCMPGTB instruction.
715///
716/// \param __a
717/// A 256-bit integer vector containing one of the inputs.
718/// \param __b
719/// A 256-bit integer vector containing one of the inputs.
720/// \returns A 256-bit integer vector containing the result.
721static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
722_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
723{
724 /* This function always performs a signed comparison, but __v32qi is a char
725 which may be signed or unsigned, so use __v32qs. */
726 return (__m256i)((__v32qs)__a > (__v32qs)__b);
727}
728
729/// Compares corresponding signed elements in the 256-bit vectors of
730/// [16 x i16] in \a __a and \a __b for greater-than and returns the
731/// outcomes in the corresponding elements of the 256-bit result.
732///
733/// \code{.operation}
734/// FOR i := 0 TO 15
735/// j := i*16
736/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
737/// ENDFOR
738/// \endcode
739///
740/// \headerfile <immintrin.h>
741///
742/// This intrinsic corresponds to the \c VPCMPGTW instruction.
743///
744/// \param __a
745/// A 256-bit vector of [16 x i16] containing one of the inputs.
746/// \param __b
747/// A 256-bit vector of [16 x i16] containing one of the inputs.
748/// \returns A 256-bit vector of [16 x i16] containing the result.
749static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
750_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
751{
752 return (__m256i)((__v16hi)__a > (__v16hi)__b);
753}
754
755/// Compares corresponding signed elements in the 256-bit vectors of
756/// [8 x i32] in \a __a and \a __b for greater-than and returns the
757/// outcomes in the corresponding elements of the 256-bit result.
758///
759/// \code{.operation}
760/// FOR i := 0 TO 7
761/// j := i*32
762/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
763/// ENDFOR
764/// \endcode
765///
766/// \headerfile <immintrin.h>
767///
768/// This intrinsic corresponds to the \c VPCMPGTD instruction.
769///
770/// \param __a
771/// A 256-bit vector of [8 x i32] containing one of the inputs.
772/// \param __b
773/// A 256-bit vector of [8 x i32] containing one of the inputs.
774/// \returns A 256-bit vector of [8 x i32] containing the result.
775static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
776_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
777{
778 return (__m256i)((__v8si)__a > (__v8si)__b);
779}
780
781/// Compares corresponding signed elements in the 256-bit vectors of
782/// [4 x i64] in \a __a and \a __b for greater-than and returns the
783/// outcomes in the corresponding elements of the 256-bit result.
784///
785/// \code{.operation}
786/// FOR i := 0 TO 3
787/// j := i*64
788/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
789/// ENDFOR
790/// \endcode
791///
792/// \headerfile <immintrin.h>
793///
794/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
795///
796/// \param __a
797/// A 256-bit vector of [4 x i64] containing one of the inputs.
798/// \param __b
799/// A 256-bit vector of [4 x i64] containing one of the inputs.
800/// \returns A 256-bit vector of [4 x i64] containing the result.
801static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
802_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
803{
804 return (__m256i)((__v4di)__a > (__v4di)__b);
805}
806
807/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
808/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
809/// element of the [16 x i16] result (overflow is ignored). Sums from
810/// \a __a are returned in the lower 64 bits of each 128-bit half of the
811/// result; sums from \a __b are returned in the upper 64 bits of each
812/// 128-bit half of the result.
813///
814/// \code{.operation}
815/// FOR i := 0 TO 1
816/// j := i*128
817/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
818/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
819/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
820/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
821/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
822/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
823/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
824/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
825/// ENDFOR
826/// \endcode
827///
828/// \headerfile <immintrin.h>
829///
830/// This intrinsic corresponds to the \c VPHADDW instruction.
831///
832/// \param __a
833/// A 256-bit vector of [16 x i16] containing one of the source operands.
834/// \param __b
835/// A 256-bit vector of [16 x i16] containing one of the source operands.
836/// \returns A 256-bit vector of [16 x i16] containing the sums.
837static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
838_mm256_hadd_epi16(__m256i __a, __m256i __b) {
839 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
840}
841
842/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
843/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
844/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
845/// are returned in the lower 64 bits of each 128-bit half of the result;
846/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
847/// of the result.
848///
849/// \code{.operation}
850/// FOR i := 0 TO 1
851/// j := i*128
852/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
853/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
854/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
855/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
856/// ENDFOR
857/// \endcode
858///
859/// \headerfile <immintrin.h>
860///
861/// This intrinsic corresponds to the \c VPHADDD instruction.
862///
863/// \param __a
864/// A 256-bit vector of [8 x i32] containing one of the source operands.
865/// \param __b
866/// A 256-bit vector of [8 x i32] containing one of the source operands.
867/// \returns A 256-bit vector of [8 x i32] containing the sums.
868static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
869_mm256_hadd_epi32(__m256i __a, __m256i __b) {
870 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
871}
872
873/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
874/// vectors of [16 x i16] using signed saturation and returns each sum in
875/// an element of the [16 x i16] result. Sums from \a __a are returned in
876/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
877/// are returned in the upper 64 bits of each 128-bit half of the result.
878///
879/// \code{.operation}
880/// FOR i := 0 TO 1
881/// j := i*128
882/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
883/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
884/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
885/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
886/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
887/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
888/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
889/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
890/// ENDFOR
891/// \endcode
892///
893/// \headerfile <immintrin.h>
894///
895/// This intrinsic corresponds to the \c VPHADDSW instruction.
896///
897/// \param __a
898/// A 256-bit vector of [16 x i16] containing one of the source operands.
899/// \param __b
900/// A 256-bit vector of [16 x i16] containing one of the source operands.
901/// \returns A 256-bit vector of [16 x i16] containing the sums.
902static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
903_mm256_hadds_epi16(__m256i __a, __m256i __b) {
904 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
905}
906
907/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
908/// vectors of [16 x i16] and returns the lower 16 bits of each difference
909/// in an element of the [16 x i16] result (overflow is ignored).
910/// Differences from \a __a are returned in the lower 64 bits of each
911/// 128-bit half of the result; differences from \a __b are returned in the
912/// upper 64 bits of each 128-bit half of the result.
913///
914/// \code{.operation}
915/// FOR i := 0 TO 1
916/// j := i*128
917/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
918/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
919/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
920/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
921/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
922/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
923/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
924/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
925/// ENDFOR
926/// \endcode
927///
928/// \headerfile <immintrin.h>
929///
930/// This intrinsic corresponds to the \c VPHSUBW instruction.
931///
932/// \param __a
933/// A 256-bit vector of [16 x i16] containing one of the source operands.
934/// \param __b
935/// A 256-bit vector of [16 x i16] containing one of the source operands.
936/// \returns A 256-bit vector of [16 x i16] containing the differences.
937static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
938_mm256_hsub_epi16(__m256i __a, __m256i __b) {
939 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
940}
941
942/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
943/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
944/// an element of the [8 x i32] result (overflow is ignored). Differences
945/// from \a __a are returned in the lower 64 bits of each 128-bit half of
946/// the result; differences from \a __b are returned in the upper 64 bits
947/// of each 128-bit half of the result.
948///
949/// \code{.operation}
950/// FOR i := 0 TO 1
951/// j := i*128
952/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
953/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
954/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
955/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
956/// ENDFOR
957/// \endcode
958///
959/// \headerfile <immintrin.h>
960///
961/// This intrinsic corresponds to the \c VPHSUBD instruction.
962///
963/// \param __a
964/// A 256-bit vector of [8 x i32] containing one of the source operands.
965/// \param __b
966/// A 256-bit vector of [8 x i32] containing one of the source operands.
967/// \returns A 256-bit vector of [8 x i32] containing the differences.
968static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
969_mm256_hsub_epi32(__m256i __a, __m256i __b) {
970 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
971}
972
973/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
974/// vectors of [16 x i16] using signed saturation and returns each sum in
975/// an element of the [16 x i16] result. Differences from \a __a are
976/// returned in the lower 64 bits of each 128-bit half of the result;
977/// differences from \a __b are returned in the upper 64 bits of each
978/// 128-bit half of the result.
979///
980/// \code{.operation}
981/// FOR i := 0 TO 1
982/// j := i*128
983/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
984/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
985/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
986/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
987/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
988/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
989/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
990/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
991/// ENDFOR
992/// \endcode
993///
994/// \headerfile <immintrin.h>
995///
996/// This intrinsic corresponds to the \c VPHSUBSW instruction.
997///
998/// \param __a
999/// A 256-bit vector of [16 x i16] containing one of the source operands.
1000/// \param __b
1001/// A 256-bit vector of [16 x i16] containing one of the source operands.
1002/// \returns A 256-bit vector of [16 x i16] containing the differences.
1003static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1004_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
1005 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1006}
1007
1008/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1009/// with the corresponding signed byte from the 256-bit integer vector in
1010/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1011/// pairs of those products using signed saturation to form 16-bit sums
1012/// returned as elements of the [16 x i16] result.
1013///
1014/// \code{.operation}
1015/// FOR i := 0 TO 15
1016/// j := i*16
1017/// temp1 := __a[j+7:j] * __b[j+7:j]
1018/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1019/// result[j+15:j] := SATURATE16(temp1 + temp2)
1020/// ENDFOR
1021/// \endcode
1022///
1023/// \headerfile <immintrin.h>
1024///
1025/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1026///
1027/// \param __a
1028/// A 256-bit vector containing one of the source operands.
1029/// \param __b
1030/// A 256-bit vector containing one of the source operands.
1031/// \returns A 256-bit vector of [16 x i16] containing the result.
1032static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1033_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
1034 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1035}
1036
1037/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1038/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1039/// those products to form 32-bit sums returned as elements of the
1040/// [8 x i32] result.
1041///
1042/// There is only one wraparound case: when all four of the 16-bit sources
1043/// are \c 0x8000, the result will be \c 0x80000000.
1044///
1045/// \code{.operation}
1046/// FOR i := 0 TO 7
1047/// j := i*32
1048/// temp1 := __a[j+15:j] * __b[j+15:j]
1049/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1050/// result[j+31:j] := temp1 + temp2
1051/// ENDFOR
1052/// \endcode
1053///
1054/// \headerfile <immintrin.h>
1055///
1056/// This intrinsic corresponds to the \c VPMADDWD instruction.
1057///
1058/// \param __a
1059/// A 256-bit vector of [16 x i16] containing one of the source operands.
1060/// \param __b
1061/// A 256-bit vector of [16 x i16] containing one of the source operands.
1062/// \returns A 256-bit vector of [8 x i32] containing the result.
1063static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1064_mm256_madd_epi16(__m256i __a, __m256i __b) {
1065 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1066}
1067
1068/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1069/// in \a __a and \a __b and returns the larger of each pair in the
1070/// corresponding byte of the 256-bit result.
1071///
1072/// \headerfile <immintrin.h>
1073///
1074/// This intrinsic corresponds to the \c VPMAXSB instruction.
1075///
1076/// \param __a
1077/// A 256-bit integer vector.
1078/// \param __b
1079/// A 256-bit integer vector.
1080/// \returns A 256-bit integer vector containing the result.
1081static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1082_mm256_max_epi8(__m256i __a, __m256i __b) {
1083 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1084}
1085
1086/// Compares the corresponding signed 16-bit integers in the two 256-bit
1087/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1088/// each pair in the corresponding element of the 256-bit result.
1089///
1090/// \headerfile <immintrin.h>
1091///
1092/// This intrinsic corresponds to the \c VPMAXSW instruction.
1093///
1094/// \param __a
1095/// A 256-bit vector of [16 x i16].
1096/// \param __b
1097/// A 256-bit vector of [16 x i16].
1098/// \returns A 256-bit vector of [16 x i16] containing the result.
1099static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1100_mm256_max_epi16(__m256i __a, __m256i __b) {
1101 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1102}
1103
1104/// Compares the corresponding signed 32-bit integers in the two 256-bit
1105/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1106/// each pair in the corresponding element of the 256-bit result.
1107///
1108/// \headerfile <immintrin.h>
1109///
1110/// This intrinsic corresponds to the \c VPMAXSD instruction.
1111///
1112/// \param __a
1113/// A 256-bit vector of [8 x i32].
1114/// \param __b
1115/// A 256-bit vector of [8 x i32].
1116/// \returns A 256-bit vector of [8 x i32] containing the result.
1117static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1118_mm256_max_epi32(__m256i __a, __m256i __b) {
1119 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1120}
1121
1122/// Compares the corresponding unsigned bytes in the two 256-bit integer
1123/// vectors in \a __a and \a __b and returns the larger of each pair in
1124/// the corresponding byte of the 256-bit result.
1125///
1126/// \headerfile <immintrin.h>
1127///
1128/// This intrinsic corresponds to the \c VPMAXUB instruction.
1129///
1130/// \param __a
1131/// A 256-bit integer vector.
1132/// \param __b
1133/// A 256-bit integer vector.
1134/// \returns A 256-bit integer vector containing the result.
1135static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1136_mm256_max_epu8(__m256i __a, __m256i __b) {
1137 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1138}
1139
1140/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1141/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1142/// each pair in the corresponding element of the 256-bit result.
1143///
1144/// \headerfile <immintrin.h>
1145///
1146/// This intrinsic corresponds to the \c VPMAXUW instruction.
1147///
1148/// \param __a
1149/// A 256-bit vector of [16 x i16].
1150/// \param __b
1151/// A 256-bit vector of [16 x i16].
1152/// \returns A 256-bit vector of [16 x i16] containing the result.
1153static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1154_mm256_max_epu16(__m256i __a, __m256i __b) {
1155 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1156}
1157
1158/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1159/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1160/// each pair in the corresponding element of the 256-bit result.
1161///
1162/// \headerfile <immintrin.h>
1163///
1164/// This intrinsic corresponds to the \c VPMAXUD instruction.
1165///
1166/// \param __a
1167/// A 256-bit vector of [8 x i32].
1168/// \param __b
1169/// A 256-bit vector of [8 x i32].
1170/// \returns A 256-bit vector of [8 x i32] containing the result.
1171static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1172_mm256_max_epu32(__m256i __a, __m256i __b) {
1173 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1174}
1175
1176/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1177/// in \a __a and \a __b and returns the smaller of each pair in the
1178/// corresponding byte of the 256-bit result.
1179///
1180/// \headerfile <immintrin.h>
1181///
1182/// This intrinsic corresponds to the \c VPMINSB instruction.
1183///
1184/// \param __a
1185/// A 256-bit integer vector.
1186/// \param __b
1187/// A 256-bit integer vector.
1188/// \returns A 256-bit integer vector containing the result.
1189static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1190_mm256_min_epi8(__m256i __a, __m256i __b) {
1191 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1192}
1193
1194/// Compares the corresponding signed 16-bit integers in the two 256-bit
1195/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1196/// each pair in the corresponding element of the 256-bit result.
1197///
1198/// \headerfile <immintrin.h>
1199///
1200/// This intrinsic corresponds to the \c VPMINSW instruction.
1201///
1202/// \param __a
1203/// A 256-bit vector of [16 x i16].
1204/// \param __b
1205/// A 256-bit vector of [16 x i16].
1206/// \returns A 256-bit vector of [16 x i16] containing the result.
1207static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1208_mm256_min_epi16(__m256i __a, __m256i __b) {
1209 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1210}
1211
1212/// Compares the corresponding signed 32-bit integers in the two 256-bit
1213/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1214/// each pair in the corresponding element of the 256-bit result.
1215///
1216/// \headerfile <immintrin.h>
1217///
1218/// This intrinsic corresponds to the \c VPMINSD instruction.
1219///
1220/// \param __a
1221/// A 256-bit vector of [8 x i32].
1222/// \param __b
1223/// A 256-bit vector of [8 x i32].
1224/// \returns A 256-bit vector of [8 x i32] containing the result.
1225static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1226_mm256_min_epi32(__m256i __a, __m256i __b) {
1227 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1228}
1229
1230/// Compares the corresponding unsigned bytes in the two 256-bit integer
1231/// vectors in \a __a and \a __b and returns the smaller of each pair in
1232/// the corresponding byte of the 256-bit result.
1233///
1234/// \headerfile <immintrin.h>
1235///
1236/// This intrinsic corresponds to the \c VPMINUB instruction.
1237///
1238/// \param __a
1239/// A 256-bit integer vector.
1240/// \param __b
1241/// A 256-bit integer vector.
1242/// \returns A 256-bit integer vector containing the result.
1243static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1244_mm256_min_epu8(__m256i __a, __m256i __b) {
1245 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1246}
1247
1248/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1249/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1250/// each pair in the corresponding element of the 256-bit result.
1251///
1252/// \headerfile <immintrin.h>
1253///
1254/// This intrinsic corresponds to the \c VPMINUW instruction.
1255///
1256/// \param __a
1257/// A 256-bit vector of [16 x i16].
1258/// \param __b
1259/// A 256-bit vector of [16 x i16].
1260/// \returns A 256-bit vector of [16 x i16] containing the result.
1261static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1262_mm256_min_epu16(__m256i __a, __m256i __b) {
1263 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1264}
1265
1266/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1267/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1268/// each pair in the corresponding element of the 256-bit result.
1269///
1270/// \headerfile <immintrin.h>
1271///
1272/// This intrinsic corresponds to the \c VPMINUD instruction.
1273///
1274/// \param __a
1275/// A 256-bit vector of [8 x i32].
1276/// \param __b
1277/// A 256-bit vector of [8 x i32].
1278/// \returns A 256-bit vector of [8 x i32] containing the result.
1279static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1280_mm256_min_epu32(__m256i __a, __m256i __b) {
1281 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1282}
1283
1284/// Creates a 32-bit integer mask from the most significant bit of each byte
1285/// in the 256-bit integer vector in \a __a and returns the result.
1286///
1287/// \code{.operation}
1288/// FOR i := 0 TO 31
1289/// j := i*8
1290/// result[i] := __a[j+7]
1291/// ENDFOR
1292/// \endcode
1293///
1294/// \headerfile <immintrin.h>
1295///
1296/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1297///
1298/// \param __a
1299/// A 256-bit integer vector containing the source bytes.
1300/// \returns The 32-bit integer mask.
1301static __inline__ int __DEFAULT_FN_ATTRS256_CONSTEXPR
1303 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1304}
1305
1306/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1307/// the 16-bit values in the corresponding elements of a 256-bit vector
1308/// of [16 x i16].
1309///
1310/// \code{.operation}
1311/// FOR i := 0 TO 15
1312/// j := i*8
1313/// k := i*16
1314/// result[k+15:k] := SignExtend(__V[j+7:j])
1315/// ENDFOR
1316/// \endcode
1317///
1318/// \headerfile <immintrin.h>
1319///
1320/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1321///
1322/// \param __V
1323/// A 128-bit integer vector containing the source bytes.
1324/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1325/// values.
1326static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1328 /* This function always performs a signed extension, but __v16qi is a char
1329 which may be signed or unsigned, so use __v16qs. */
1330 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1331}
1332
1333/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1334/// \a __V and returns the 32-bit values in the corresponding elements of a
1335/// 256-bit vector of [8 x i32].
1336///
1337/// \code{.operation}
1338/// FOR i := 0 TO 7
1339/// j := i*8
1340/// k := i*32
1341/// result[k+31:k] := SignExtend(__V[j+7:j])
1342/// ENDFOR
1343/// \endcode
1344///
1345/// \headerfile <immintrin.h>
1346///
1347/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1348///
1349/// \param __V
1350/// A 128-bit integer vector containing the source bytes.
1351/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1352/// values.
1353static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1355 /* This function always performs a signed extension, but __v16qi is a char
1356 which may be signed or unsigned, so use __v16qs. */
1357 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1358}
1359
1360/// Sign-extends the first four bytes from the 128-bit integer vector in
1361/// \a __V and returns the 64-bit values in the corresponding elements of a
1362/// 256-bit vector of [4 x i64].
1363///
1364/// \code{.operation}
1365/// result[63:0] := SignExtend(__V[7:0])
1366/// result[127:64] := SignExtend(__V[15:8])
1367/// result[191:128] := SignExtend(__V[23:16])
1368/// result[255:192] := SignExtend(__V[31:24])
1369/// \endcode
1370///
1371/// \headerfile <immintrin.h>
1372///
1373/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1374///
1375/// \param __V
1376/// A 128-bit integer vector containing the source bytes.
1377/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1378/// values.
1379static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1381 /* This function always performs a signed extension, but __v16qi is a char
1382 which may be signed or unsigned, so use __v16qs. */
1383 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1384}
1385
1386/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1387/// \a __V and returns the 32-bit values in the corresponding elements of a
1388/// 256-bit vector of [8 x i32].
1389///
1390/// \code{.operation}
1391/// FOR i := 0 TO 7
1392/// j := i*16
1393/// k := i*32
1394/// result[k+31:k] := SignExtend(__V[j+15:j])
1395/// ENDFOR
1396/// \endcode
1397///
1398/// \headerfile <immintrin.h>
1399///
1400/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1401///
1402/// \param __V
1403/// A 128-bit vector of [8 x i16] containing the source values.
1404/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1405/// values.
1406static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1408 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1409}
1410
1411/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1412/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1413/// elements of a 256-bit vector of [4 x i64].
1414///
1415/// \code{.operation}
1416/// result[63:0] := SignExtend(__V[15:0])
1417/// result[127:64] := SignExtend(__V[31:16])
1418/// result[191:128] := SignExtend(__V[47:32])
1419/// result[255:192] := SignExtend(__V[64:48])
1420/// \endcode
1421///
1422/// \headerfile <immintrin.h>
1423///
1424/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1425///
1426/// \param __V
1427/// A 128-bit vector of [8 x i16] containing the source values.
1428/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1429/// values.
1430static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1432 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1433}
1434
1435/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1436/// \a __V and returns the 64-bit values in the corresponding elements of a
1437/// 256-bit vector of [4 x i64].
1438///
1439/// \code{.operation}
1440/// result[63:0] := SignExtend(__V[31:0])
1441/// result[127:64] := SignExtend(__V[63:32])
1442/// result[191:128] := SignExtend(__V[95:64])
1443/// result[255:192] := SignExtend(__V[127:96])
1444/// \endcode
1445///
1446/// \headerfile <immintrin.h>
1447///
1448/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1449///
1450/// \param __V
1451/// A 128-bit vector of [4 x i32] containing the source values.
1452/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1453/// values.
1454static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1456 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1457}
1458
1459/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1460/// the 16-bit values in the corresponding elements of a 256-bit vector
1461/// of [16 x i16].
1462///
1463/// \code{.operation}
1464/// FOR i := 0 TO 15
1465/// j := i*8
1466/// k := i*16
1467/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1468/// ENDFOR
1469/// \endcode
1470///
1471/// \headerfile <immintrin.h>
1472///
1473/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1474///
1475/// \param __V
1476/// A 128-bit integer vector containing the source bytes.
1477/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1478/// values.
1479static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1481 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1482}
1483
1484/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1485/// \a __V and returns the 32-bit values in the corresponding elements of a
1486/// 256-bit vector of [8 x i32].
1487///
1488/// \code{.operation}
1489/// FOR i := 0 TO 7
1490/// j := i*8
1491/// k := i*32
1492/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1493/// ENDFOR
1494/// \endcode
1495///
1496/// \headerfile <immintrin.h>
1497///
1498/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1499///
1500/// \param __V
1501/// A 128-bit integer vector containing the source bytes.
1502/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1503/// values.
1504static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1506 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1507}
1508
1509/// Zero-extends the first four bytes from the 128-bit integer vector in
1510/// \a __V and returns the 64-bit values in the corresponding elements of a
1511/// 256-bit vector of [4 x i64].
1512///
1513/// \code{.operation}
1514/// result[63:0] := ZeroExtend(__V[7:0])
1515/// result[127:64] := ZeroExtend(__V[15:8])
1516/// result[191:128] := ZeroExtend(__V[23:16])
1517/// result[255:192] := ZeroExtend(__V[31:24])
1518/// \endcode
1519///
1520/// \headerfile <immintrin.h>
1521///
1522/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1523///
1524/// \param __V
1525/// A 128-bit integer vector containing the source bytes.
1526/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1527/// values.
1528static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1530 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1531}
1532
1533/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1534/// \a __V and returns the 32-bit values in the corresponding elements of a
1535/// 256-bit vector of [8 x i32].
1536///
1537/// \code{.operation}
1538/// FOR i := 0 TO 7
1539/// j := i*16
1540/// k := i*32
1541/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1542/// ENDFOR
1543/// \endcode
1544///
1545/// \headerfile <immintrin.h>
1546///
1547/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1548///
1549/// \param __V
1550/// A 128-bit vector of [8 x i16] containing the source values.
1551/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1552/// values.
1553static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1555 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1556}
1557
1558/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1559/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1560/// elements of a 256-bit vector of [4 x i64].
1561///
1562/// \code{.operation}
1563/// result[63:0] := ZeroExtend(__V[15:0])
1564/// result[127:64] := ZeroExtend(__V[31:16])
1565/// result[191:128] := ZeroExtend(__V[47:32])
1566/// result[255:192] := ZeroExtend(__V[64:48])
1567/// \endcode
1568///
1569/// \headerfile <immintrin.h>
1570///
1571/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1572///
1573/// \param __V
1574/// A 128-bit vector of [8 x i16] containing the source values.
1575/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1576/// values.
1577static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1579 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1580}
1581
1582/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1583/// \a __V and returns the 64-bit values in the corresponding elements of a
1584/// 256-bit vector of [4 x i64].
1585///
1586/// \code{.operation}
1587/// result[63:0] := ZeroExtend(__V[31:0])
1588/// result[127:64] := ZeroExtend(__V[63:32])
1589/// result[191:128] := ZeroExtend(__V[95:64])
1590/// result[255:192] := ZeroExtend(__V[127:96])
1591/// \endcode
1592///
1593/// \headerfile <immintrin.h>
1594///
1595/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1596///
1597/// \param __V
1598/// A 128-bit vector of [4 x i32] containing the source values.
1599/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1600/// values.
1601static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1603 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1604}
1605
1606/// Multiplies signed 32-bit integers from even-numbered elements of two
1607/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1608/// [4 x i64] result.
1609///
1610/// \code{.operation}
1611/// result[63:0] := __a[31:0] * __b[31:0]
1612/// result[127:64] := __a[95:64] * __b[95:64]
1613/// result[191:128] := __a[159:128] * __b[159:128]
1614/// result[255:192] := __a[223:192] * __b[223:192]
1615/// \endcode
1616///
1617/// \headerfile <immintrin.h>
1618///
1619/// This intrinsic corresponds to the \c VPMULDQ instruction.
1620///
1621/// \param __a
1622/// A 256-bit vector of [8 x i32] containing one of the source operands.
1623/// \param __b
1624/// A 256-bit vector of [8 x i32] containing one of the source operands.
1625/// \returns A 256-bit vector of [4 x i64] containing the products.
1626static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1627_mm256_mul_epi32(__m256i __a, __m256i __b) {
1628 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1629}
1630
1631/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1632/// [16 x i16], truncates the 32-bit results to the most significant 18
1633/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1634/// product in the [16 x i16] result.
1635///
1636/// \code{.operation}
1637/// FOR i := 0 TO 15
1638/// j := i*16
1639/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1640/// result[j+15:j] := temp[16:1]
1641/// \endcode
1642///
1643/// \headerfile <immintrin.h>
1644///
1645/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1646///
1647/// \param __a
1648/// A 256-bit vector of [16 x i16] containing one of the source operands.
1649/// \param __b
1650/// A 256-bit vector of [16 x i16] containing one of the source operands.
1651/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1652static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1653_mm256_mulhrs_epi16(__m256i __a, __m256i __b) {
  /* The wrapper only reinterprets the operands as __v16hi and the result as
     __m256i; the multiply, rounding and bit selection described above are
     all left to the builtin. */
1654 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1655}
1656
1657/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1658/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1659/// [16 x i16] result.
1660///
1661/// \headerfile <immintrin.h>
1662///
1663/// This intrinsic corresponds to the \c VPMULHUW instruction.
1664///
1665/// \param __a
1666/// A 256-bit vector of [16 x i16] containing one of the source operands.
1667/// \param __b
1668/// A 256-bit vector of [16 x i16] containing one of the source operands.
1669/// \returns A 256-bit vector of [16 x i16] containing the products.
1670static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1671_mm256_mulhi_epu16(__m256i __a, __m256i __b) {
  /* Uses the unsigned view (__v16hu) of both operands, unlike
     _mm256_mulhi_epi16, which passes __v16hi to its builtin. */
1672 return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
1673}
1674
1675/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1677/// [16 x i16] result.
1678///
1679/// \headerfile <immintrin.h>
1680///
1681/// This intrinsic corresponds to the \c VPMULHW instruction.
1682///
1683/// \param __a
1684/// A 256-bit vector of [16 x i16] containing one of the source operands.
1685/// \param __b
1686/// A 256-bit vector of [16 x i16] containing one of the source operands.
1687/// \returns A 256-bit vector of [16 x i16] containing the products.
1688static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1689_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1690{
  /* Signed counterpart of _mm256_mulhi_epu16: the operands go to the builtin
     as __v16hi rather than __v16hu. */
1691 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1692}
1693
1694/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1695/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1696/// [16 x i16] result.
1697///
1698/// \headerfile <immintrin.h>
1699///
1700/// This intrinsic corresponds to the \c VPMULLW instruction.
1701///
1702/// \param __a
1703/// A 256-bit vector of [16 x i16] containing one of the source operands.
1704/// \param __b
1705/// A 256-bit vector of [16 x i16] containing one of the source operands.
1706/// \returns A 256-bit vector of [16 x i16] containing the products.
1707static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1708_mm256_mullo_epi16(__m256i __a, __m256i __b)
1709{
  /* Plain element-wise vector multiply; no builtin is involved.
     NOTE(review): the operands are viewed as unsigned (__v16hu) although the
     elements are documented as signed -- presumably so that keeping only the
     low 16 bits of each product is well-defined wrap-around; confirm. */
1710 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1711}
1712
1713/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1714/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1715/// [8 x i32] result.
1716///
1717/// \headerfile <immintrin.h>
1718///
1719/// This intrinsic corresponds to the \c VPMULLD instruction.
1720///
1721/// \param __a
1722/// A 256-bit vector of [8 x i32] containing one of the source operands.
1723/// \param __b
1724/// A 256-bit vector of [8 x i32] containing one of the source operands.
1725/// \returns A 256-bit vector of [8 x i32] containing the products.
1726static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1727_mm256_mullo_epi32(__m256i __a, __m256i __b) {
  /* Plain element-wise vector multiply; no builtin is involved.
     NOTE(review): the operands are viewed as unsigned (__v8su) although the
     elements are documented as signed -- presumably so that keeping only the
     low 32 bits of each product is well-defined wrap-around; confirm. */
1728 return (__m256i)((__v8su)__a * (__v8su)__b);
1729}
1730
1731/// Multiplies unsigned 32-bit integers from even-numered elements of two
1732/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1733/// [4 x i64] result.
1734///
1735/// \code{.operation}
1736/// result[63:0] := __a[31:0] * __b[31:0]
1737/// result[127:64] := __a[95:64] * __b[95:64]
1738/// result[191:128] := __a[159:128] * __b[159:128]
1739/// result[255:192] := __a[223:192] * __b[223:192]
1740/// \endcode
1741///
1742/// \headerfile <immintrin.h>
1743///
1744/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1745///
1746/// \param __a
1747/// A 256-bit vector of [8 x i32] containing one of the source operands.
1748/// \param __b
1749/// A 256-bit vector of [8 x i32] containing one of the source operands.
1750/// \returns A 256-bit vector of [4 x i64] containing the products.
1751static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1752_mm256_mul_epu32(__m256i __a, __m256i __b) {
1753 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1754}
1755
1756/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1757/// \a __b.
1758///
1759/// \headerfile <immintrin.h>
1760///
1761/// This intrinsic corresponds to the \c VPOR instruction.
1762///
1763/// \param __a
1764/// A 256-bit integer vector.
1765/// \param __b
1766/// A 256-bit integer vector.
1767/// \returns A 256-bit integer vector containing the result.
1768static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1769_mm256_or_si256(__m256i __a, __m256i __b)
1770{
  /* Element-wise | on the __v4du view of both operands; a bitwise OR yields
     the same 256 bits whichever element width is used for the view. */
1771 return (__m256i)((__v4du)__a | (__v4du)__b);
1772}
1773
1774/// Computes four sum of absolute difference (SAD) operations on sets of eight
1775/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1776/// \a __b.
1777///
1778/// One SAD result is computed for each set of eight bytes from \a __a and
1779/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1780/// corresponding 64-bit element of the result.
1781///
1782/// A single SAD operation takes the differences between the corresponding
1783/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1784/// and sums these eight values to form one 16-bit result. This operation
1785/// is repeated four times with successive sets of eight bytes.
1786///
1787/// \code{.operation}
1788/// FOR i := 0 TO 3
1789/// j := i*64
1790/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1791/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1792/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1793/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1794/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1795/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1796/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1797/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1798/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1799/// temp4 + temp5 + temp6 + temp7
1800/// result[j+63:j+16] := 0
1801/// ENDFOR
1802/// \endcode
1803///
1804/// \headerfile <immintrin.h>
1805///
1806/// This intrinsic corresponds to the \c VPSADBW instruction.
1807///
1808/// \param __a
1809/// A 256-bit integer vector.
1810/// \param __b
1811/// A 256-bit integer vector.
1812/// \returns A 256-bit integer vector containing the result.
1813static __inline__ __m256i __DEFAULT_FN_ATTRS256
1814_mm256_sad_epu8(__m256i __a, __m256i __b)
1815{
1816 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1817}
1818
1819/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1820/// to control information in the 256-bit integer vector \a __b, and
1821/// returns the 256-bit result. In effect there are two separate 128-bit
1822/// shuffles in the lower and upper halves.
1823///
1824/// \code{.operation}
1825/// FOR i := 0 TO 31
1826/// j := i*8
1827/// IF __b[j+7] == 1
1828/// result[j+7:j] := 0
1829/// ELSE
1830/// k := __b[j+3:j] * 8
1831/// IF i > 15
1832/// k := k + 128
1833/// FI
1834/// result[j+7:j] := __a[k+7:k]
1835/// FI
1836/// ENDFOR
1837/// \endcode
1838///
1839/// \headerfile <immintrin.h>
1840///
1841/// This intrinsic corresponds to the \c VPSHUFB instruction.
1842///
1843/// \param __a
1844/// A 256-bit integer vector containing source values.
1845/// \param __b
1846/// A 256-bit integer vector containing control information to determine
1847/// what goes into the corresponding byte of the result. If bit 7 of the
1848/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1849/// control byte specify the index (within the same 128-bit half) of \a __a
1850/// to copy to the result byte.
1851/// \returns A 256-bit integer vector containing the result.
1852static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1853_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
  /* Argument order matters: __a is the data vector and __b the per-byte
     control vector. Both are passed to the builtin as __v32qi. */
1854 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1855}
1856
1857/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1858/// according to control information in the integer literal \a imm, and
1859/// returns the 256-bit result. In effect there are two parallel 128-bit
1860/// shuffles in the lower and upper halves.
1861///
1862/// \code{.operation}
1863/// FOR i := 0 TO 3
1864/// j := i*32
1865/// k := (imm >> i*2)[1:0] * 32
1866/// result[j+31:j] := a[k+31:k]
1867/// result[128+j+31:128+j] := a[128+k+31:128+k]
1868/// ENDFOR
1869/// \endcode
1870///
1871/// \headerfile <immintrin.h>
1872///
1873/// \code
1874/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1875/// \endcode
1876///
1877/// This intrinsic corresponds to the \c VPSHUFD instruction.
1878///
1879/// \param a
1880/// A 256-bit vector of [8 x i32] containing source values.
1881/// \param imm
1882/// An immediate 8-bit value specifying which elements to copy from \a a.
1883/// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1884/// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1885/// forth.
1886/// \returns A 256-bit vector of [8 x i32] containing the result.
/* (a) is converted to __m256i and then viewed as __v8si; (imm) is converted
   to int. Each argument is expanded exactly once, fully parenthesized. */
1887#define _mm256_shuffle_epi32(a, imm) \
1888 ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1889
1890/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1891/// according to control information in the integer literal \a imm, and
1892/// returns the 256-bit result. The upper 64 bits of each 128-bit half
1893/// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1894/// copied from \a a unchanged.
1895///
1896/// \code{.operation}
1897/// result[63:0] := a[63:0]
1898/// result[191:128] := a[191:128]
1899/// FOR i := 0 TO 3
1900/// j := i * 16 + 64
1901/// k := (imm >> i*2)[1:0] * 16 + 64
1902/// result[j+15:j] := a[k+15:k]
1903/// result[128+j+15:128+j] := a[128+k+15:128+k]
1904/// ENDFOR
1905/// \endcode
1906///
1907/// \headerfile <immintrin.h>
1908///
1909/// \code
1910/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1911/// \endcode
1912///
1913/// This intrinsic corresponds to the \c VPSHUFHW instruction.
1914///
1915/// \param a
1916/// A 256-bit vector of [16 x i16] containing source values.
1917/// \param imm
1918/// An immediate 8-bit value specifying which elements to copy from \a a.
1919/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1920/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1921/// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1922/// \returns A 256-bit vector of [16 x i16] containing the result.
/* (a) is converted to __m256i and then viewed as __v16hi; (imm) is converted
   to int. Each argument is expanded exactly once, fully parenthesized. */
1923#define _mm256_shufflehi_epi16(a, imm) \
1924 ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1925
1926/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1927/// according to control information in the integer literal \a imm, and
1928/// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1929/// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1930/// copied from \a a unchanged.
1931///
1932/// \code{.operation}
1933/// result[127:64] := a[127:64]
1934/// result[255:192] := a[255:192]
1935/// FOR i := 0 TO 3
1936/// j := i * 16
1937/// k := (imm >> i*2)[1:0] * 16
1938/// result[j+15:j] := a[k+15:k]
1939/// result[128+j+15:128+j] := a[128+k+15:128+k]
1940/// ENDFOR
1941/// \endcode
1942///
1943/// \headerfile <immintrin.h>
1944///
1945/// \code
1946/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1947/// \endcode
1948///
1949/// This intrinsic corresponds to the \c VPSHUFLW instruction.
1950///
1951/// \param a
1952/// A 256-bit vector of [16 x i16] to use as a source of data for the
1953/// result.
1954/// \param imm
1955/// An immediate 8-bit value specifying which elements to copy from \a a.
1956/// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1957/// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1958/// forth.
1959/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Same shape as _mm256_shufflehi_epi16 but calling the pshuflw builtin:
   (a) is converted to __m256i and viewed as __v16hi; (imm) is converted to
   int. Each argument is expanded exactly once, fully parenthesized. */
1960#define _mm256_shufflelo_epi16(a, imm) \
1961 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
1962
1963/// Sets each byte of the result to the corresponding byte of the 256-bit
1964/// integer vector in \a __a, the negative of that byte, or zero, depending
1965/// on whether the corresponding byte of the 256-bit integer vector in
1966/// \a __b is greater than zero, less than zero, or equal to zero,
1967/// respectively.
1968///
1969/// \headerfile <immintrin.h>
1970///
1971/// This intrinsic corresponds to the \c VPSIGNB instruction.
1972///
1973/// \param __a
1974/// A 256-bit integer vector.
1975/// \param __b
1976/// A 256-bit integer vector.
1977/// \returns A 256-bit integer vector containing the result.
1978static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1979_mm256_sign_epi8(__m256i __a, __m256i __b) {
  /* Argument order matters: __a supplies the values and __b the signs that
     select keep / negate / zero. Both are passed as __v32qi. */
1980 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
1981}
1982
1983/// Sets each element of the result to the corresponding element of the
1984/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
1985/// or zero, depending on whether the corresponding element of the 256-bit
1986/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
1987/// equal to zero, respectively.
1988///
1989/// \headerfile <immintrin.h>
1990///
1991/// This intrinsic corresponds to the \c VPSIGNW instruction.
1992///
1993/// \param __a
1994/// A 256-bit vector of [16 x i16].
1995/// \param __b
1996/// A 256-bit vector of [16 x i16].
1997/// \returns A 256-bit vector of [16 x i16] containing the result.
1998static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1999_mm256_sign_epi16(__m256i __a, __m256i __b) {
  /* Argument order matters: __a supplies the values and __b the signs that
     select keep / negate / zero. Both are passed as __v16hi. */
2000 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2001}
2002
2003/// Sets each element of the result to the corresponding element of the
2004/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2005/// zero, depending on whether the corresponding element of the 256-bit
2006/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2007/// equal to zero, respectively.
2008///
2009/// \headerfile <immintrin.h>
2010///
2011/// This intrinsic corresponds to the \c VPSIGND instruction.
2012///
2013/// \param __a
2014/// A 256-bit vector of [8 x i32].
2015/// \param __b
2016/// A 256-bit vector of [8 x i32].
2017/// \returns A 256-bit vector of [8 x i32] containing the result.
2018static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2019_mm256_sign_epi32(__m256i __a, __m256i __b) {
  /* Argument order matters: __a supplies the values and __b the signs that
     select keep / negate / zero. Both are passed as __v8si. */
2020 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2021}
2022
2023/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2024/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2025/// is greater than 15, the returned result is all zeroes.
2026///
2027/// \headerfile <immintrin.h>
2028///
2029/// \code
2030/// __m256i _mm256_slli_si256(__m256i a, const int imm);
2031/// \endcode
2032///
2033/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2034///
2035/// \param a
2036/// A 256-bit integer vector to be shifted.
2037/// \param imm
2038/// An unsigned immediate value specifying the shift count (in bytes).
2039/// \returns A 256-bit integer vector containing the result.
/* Expands to exactly the same builtin call as _mm256_bslli_epi128.
   (a) is converted to __m256i and viewed as __v32qi; (imm) is converted to
   int. */
2040#define _mm256_slli_si256(a, imm) \
2041 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2042 (int)(imm)))
2043
2044/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2045/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2046/// is greater than 15, the returned result is all zeroes.
2047///
2048/// \headerfile <immintrin.h>
2049///
2050/// \code
2051/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2052/// \endcode
2053///
2054/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2055///
2056/// \param a
2057/// A 256-bit integer vector to be shifted.
2058/// \param imm
2059/// An unsigned immediate value specifying the shift count (in bytes).
2060/// \returns A 256-bit integer vector containing the result.
/* Expands to exactly the same builtin call as _mm256_slli_si256.
   (a) is converted to __m256i and viewed as __v32qi; (imm) is converted to
   int. */
2061#define _mm256_bslli_epi128(a, imm) \
2062 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2063 (int)(imm)))
2064
2065/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2066/// left by \a __count bits, shifting in zero bits, and returns the result.
2067/// If \a __count is greater than 15, the returned result is all zeroes.
2068///
2069/// \headerfile <immintrin.h>
2070///
2071/// This intrinsic corresponds to the \c VPSLLW instruction.
2072///
2073/// \param __a
2074/// A 256-bit vector of [16 x i16] to be shifted.
2075/// \param __count
2076/// An unsigned integer value specifying the shift count (in bits).
2077/// \returns A 256-bit vector of [16 x i16] containing the result.
2078static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2079_mm256_slli_epi16(__m256i __a, int __count) {
  /* __count is a scalar int forwarded unchanged to the builtin; the form
     taking the count in a vector is _mm256_sll_epi16. */
2080 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2081}
2082
2083/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2084/// left by the number of bits specified by the lower 64 bits of \a __count,
2085/// shifting in zero bits, and returns the result. If \a __count is greater
2086/// than 15, the returned result is all zeroes.
2087///
2088/// \headerfile <immintrin.h>
2089///
2090/// This intrinsic corresponds to the \c VPSLLW instruction.
2091///
2092/// \param __a
2093/// A 256-bit vector of [16 x i16] to be shifted.
2094/// \param __count
2095/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2096/// shift count (in bits). The upper element is ignored.
2097/// \returns A 256-bit vector of [16 x i16] containing the result.
2098static __inline__ __m256i __DEFAULT_FN_ATTRS256
2099_mm256_sll_epi16(__m256i __a, __m128i __count)
2100{
2101 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2102}
2103
2104/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2105/// left by \a __count bits, shifting in zero bits, and returns the result.
2106/// If \a __count is greater than 31, the returned result is all zeroes.
2107///
2108/// \headerfile <immintrin.h>
2109///
2110/// This intrinsic corresponds to the \c VPSLLD instruction.
2111///
2112/// \param __a
2113/// A 256-bit vector of [8 x i32] to be shifted.
2114/// \param __count
2115/// An unsigned integer value specifying the shift count (in bits).
2116/// \returns A 256-bit vector of [8 x i32] containing the result.
2117static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2118_mm256_slli_epi32(__m256i __a, int __count) {
  /* __count is a scalar int forwarded unchanged to the builtin; the form
     taking the count in a vector is _mm256_sll_epi32. */
2119 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2120}
2121
2122/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2123/// left by the number of bits given in the lower 64 bits of \a __count,
2124/// shifting in zero bits, and returns the result. If \a __count is greater
2125/// than 31, the returned result is all zeroes.
2126///
2127/// \headerfile <immintrin.h>
2128///
2129/// This intrinsic corresponds to the \c VPSLLD instruction.
2130///
2131/// \param __a
2132/// A 256-bit vector of [8 x i32] to be shifted.
2133/// \param __count
2134/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2135/// shift count (in bits). The upper element is ignored.
2136/// \returns A 256-bit vector of [8 x i32] containing the result.
2137static __inline__ __m256i __DEFAULT_FN_ATTRS256
2138_mm256_sll_epi32(__m256i __a, __m128i __count)
2139{
2140 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2141}
2142
2143/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2144/// left by \a __count bits, shifting in zero bits, and returns the result.
2145/// If \a __count is greater than 63, the returned result is all zeroes.
2146///
2147/// \headerfile <immintrin.h>
2148///
2149/// This intrinsic corresponds to the \c VPSLLQ instruction.
2150///
2151/// \param __a
2152/// A 256-bit vector of [4 x i64] to be shifted.
2153/// \param __count
2154/// An unsigned integer value specifying the shift count (in bits).
2155/// \returns A 256-bit vector of [4 x i64] containing the result.
2156static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2157_mm256_slli_epi64(__m256i __a, int __count) {
2158 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2159}
2160
2161/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2162/// left by the number of bits given in the lower 64 bits of \a __count,
2163/// shifting in zero bits, and returns the result. If \a __count is greater
2164/// than 63, the returned result is all zeroes.
2165///
2166/// \headerfile <immintrin.h>
2167///
2168/// This intrinsic corresponds to the \c VPSLLQ instruction.
2169///
2170/// \param __a
2171/// A 256-bit vector of [4 x i64] to be shifted.
2172/// \param __count
2173/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2174/// shift count (in bits). The upper element is ignored.
2175/// \returns A 256-bit vector of [4 x i64] containing the result.
2176static __inline__ __m256i __DEFAULT_FN_ATTRS256
2177_mm256_sll_epi64(__m256i __a, __m128i __count)
2178{
2179 return __builtin_ia32_psllq256((__v4di)__a, __count);
2180}
2181
2182/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2183/// right by \a __count bits, shifting in sign bits, and returns the result.
2184/// If \a __count is greater than 15, each element of the result is either
2185/// 0 or -1 according to the corresponding input sign bit.
2186///
2187/// \headerfile <immintrin.h>
2188///
2189/// This intrinsic corresponds to the \c VPSRAW instruction.
2190///
2191/// \param __a
2192/// A 256-bit vector of [16 x i16] to be shifted.
2193/// \param __count
2194/// An unsigned integer value specifying the shift count (in bits).
2195/// \returns A 256-bit vector of [16 x i16] containing the result.
2196static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2197_mm256_srai_epi16(__m256i __a, int __count) {
  /* __count is a scalar int forwarded unchanged to the builtin; the form
     taking the count in a vector is _mm256_sra_epi16. */
2198 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2199}
2200
2201/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2202/// right by the number of bits given in the lower 64 bits of \a __count,
2203/// shifting in sign bits, and returns the result. If \a __count is greater
2204/// than 15, each element of the result is either 0 or -1 according to the
2205/// corresponding input sign bit.
2206///
2207/// \headerfile <immintrin.h>
2208///
2209/// This intrinsic corresponds to the \c VPSRAW instruction.
2210///
2211/// \param __a
2212/// A 256-bit vector of [16 x i16] to be shifted.
2213/// \param __count
2214/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2215/// shift count (in bits). The upper element is ignored.
2216/// \returns A 256-bit vector of [16 x i16] containing the result.
2217static __inline__ __m256i __DEFAULT_FN_ATTRS256
2218_mm256_sra_epi16(__m256i __a, __m128i __count)
2219{
2220 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2221}
2222
2223/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2224/// right by \a __count bits, shifting in sign bits, and returns the result.
2225/// If \a __count is greater than 31, each element of the result is either
2226/// 0 or -1 according to the corresponding input sign bit.
2227///
2228/// \headerfile <immintrin.h>
2229///
2230/// This intrinsic corresponds to the \c VPSRAD instruction.
2231///
2232/// \param __a
2233/// A 256-bit vector of [8 x i32] to be shifted.
2234/// \param __count
2235/// An unsigned integer value specifying the shift count (in bits).
2236/// \returns A 256-bit vector of [8 x i32] containing the result.
2237static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2238_mm256_srai_epi32(__m256i __a, int __count) {
  /* __count is a scalar int forwarded unchanged to the builtin; the form
     taking the count in a vector is _mm256_sra_epi32. */
2239 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2240}
2241
2242/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2243/// right by the number of bits given in the lower 64 bits of \a __count,
2244/// shifting in sign bits, and returns the result. If \a __count is greater
2245/// than 31, each element of the result is either 0 or -1 according to the
2246/// corresponding input sign bit.
2247///
2248/// \headerfile <immintrin.h>
2249///
2250/// This intrinsic corresponds to the \c VPSRAD instruction.
2251///
2252/// \param __a
2253/// A 256-bit vector of [8 x i32] to be shifted.
2254/// \param __count
2255/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2256/// shift count (in bits). The upper element is ignored.
2257/// \returns A 256-bit vector of [8 x i32] containing the result.
2258static __inline__ __m256i __DEFAULT_FN_ATTRS256
2259_mm256_sra_epi32(__m256i __a, __m128i __count)
2260{
2261 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2262}
2263
2264/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2265/// \a imm bytes, shifting in zero bytes, and returns the result. If
2266/// \a imm is greater than 15, the returned result is all zeroes.
2267///
2268/// \headerfile <immintrin.h>
2269///
2270/// \code
2271/// __m256i _mm256_srli_si256(__m256i a, const int imm);
2272/// \endcode
2273///
2274/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2275///
2276/// \param a
2277/// A 256-bit integer vector to be shifted.
2278/// \param imm
2279/// An unsigned immediate value specifying the shift count (in bytes).
2280/// \returns A 256-bit integer vector containing the result.
/* Expands to exactly the same builtin call as _mm256_bsrli_epi128.
   (a) is converted to __m256i and viewed as __v32qi; (imm) is converted to
   int. */
2281#define _mm256_srli_si256(a, imm) \
2282 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2283 (int)(imm)))
2284
2285/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2286/// \a imm bytes, shifting in zero bytes, and returns the result. If
2287/// \a imm is greater than 15, the returned result is all zeroes.
2288///
2289/// \headerfile <immintrin.h>
2290///
2291/// \code
2292/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2293/// \endcode
2294///
2295/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2296///
2297/// \param a
2298/// A 256-bit integer vector to be shifted.
2299/// \param imm
2300/// An unsigned immediate value specifying the shift count (in bytes).
2301/// \returns A 256-bit integer vector containing the result.
/* Expands to exactly the same builtin call as _mm256_srli_si256.
   (a) is converted to __m256i and viewed as __v32qi; (imm) is converted to
   int. */
2302#define _mm256_bsrli_epi128(a, imm) \
2303 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2304 (int)(imm)))
2305
2306/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2307/// right by \a __count bits, shifting in zero bits, and returns the result.
2308/// If \a __count is greater than 15, the returned result is all zeroes.
2309///
2310/// \headerfile <immintrin.h>
2311///
2312/// This intrinsic corresponds to the \c VPSRLW instruction.
2313///
2314/// \param __a
2315/// A 256-bit vector of [16 x i16] to be shifted.
2316/// \param __count
2317/// An unsigned integer value specifying the shift count (in bits).
2318/// \returns A 256-bit vector of [16 x i16] containing the result.
2319static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2320_mm256_srli_epi16(__m256i __a, int __count) {
  /* __count is a scalar int forwarded unchanged to the builtin; the form
     taking the count in a vector is _mm256_srl_epi16. */
2321 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2322}
2323
2324/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2325/// right by the number of bits given in the lower 64 bits of \a __count,
2326/// shifting in zero bits, and returns the result. If \a __count is greater
2327/// than 15, the returned result is all zeroes.
2328///
2329/// \headerfile <immintrin.h>
2330///
2331/// This intrinsic corresponds to the \c VPSRLW instruction.
2332///
2333/// \param __a
2334/// A 256-bit vector of [16 x i16] to be shifted.
2335/// \param __count
2336/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2337/// shift count (in bits). The upper element is ignored.
2338/// \returns A 256-bit vector of [16 x i16] containing the result.
2339static __inline__ __m256i __DEFAULT_FN_ATTRS256
2340_mm256_srl_epi16(__m256i __a, __m128i __count)
2341{
2342 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2343}
2344
2345/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2346/// right by \a __count bits, shifting in zero bits, and returns the result.
2347/// If \a __count is greater than 31, the returned result is all zeroes.
2348///
2349/// \headerfile <immintrin.h>
2350///
2351/// This intrinsic corresponds to the \c VPSRLD instruction.
2352///
2353/// \param __a
2354/// A 256-bit vector of [8 x i32] to be shifted.
2355/// \param __count
2356/// An unsigned integer value specifying the shift count (in bits).
2357/// \returns A 256-bit vector of [8 x i32] containing the result.
2358static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2359_mm256_srli_epi32(__m256i __a, int __count) {
  /* __count is a scalar int forwarded unchanged to the builtin; the form
     taking the count in a vector is _mm256_srl_epi32. */
2360 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2361}
2362
2363/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2364/// right by the number of bits given in the lower 64 bits of \a __count,
2365/// shifting in zero bits, and returns the result. If \a __count is greater
2366/// than 31, the returned result is all zeroes.
2367///
2368/// \headerfile <immintrin.h>
2369///
2370/// This intrinsic corresponds to the \c VPSRLD instruction.
2371///
2372/// \param __a
2373/// A 256-bit vector of [8 x i32] to be shifted.
2374/// \param __count
2375/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2376/// shift count (in bits). The upper element is ignored.
2377/// \returns A 256-bit vector of [8 x i32] containing the result.
2378static __inline__ __m256i __DEFAULT_FN_ATTRS256
2379_mm256_srl_epi32(__m256i __a, __m128i __count)
2380{
2381 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2382}
2383
2384/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2385/// right by \a __count bits, shifting in zero bits, and returns the result.
2386/// If \a __count is greater than 63, the returned result is all zeroes.
2387///
2388/// \headerfile <immintrin.h>
2389///
2390/// This intrinsic corresponds to the \c VPSRLQ instruction.
2391///
2392/// \param __a
2393/// A 256-bit vector of [4 x i64] to be shifted.
2394/// \param __count
2395/// An unsigned integer value specifying the shift count (in bits).
2396/// \returns A 256-bit vector of [4 x i64] containing the result.
2397static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2398_mm256_srli_epi64(__m256i __a, int __count) {
2399 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2400}
2401
2402/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2403/// right by the number of bits given in the lower 64 bits of \a __count,
2404/// shifting in zero bits, and returns the result. If \a __count is greater
2405/// than 63, the returned result is all zeroes.
2406///
2407/// \headerfile <immintrin.h>
2408///
2409/// This intrinsic corresponds to the \c VPSRLQ instruction.
2410///
2411/// \param __a
2412/// A 256-bit vector of [4 x i64] to be shifted.
2413/// \param __count
2414/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2415/// shift count (in bits). The upper element is ignored.
2416/// \returns A 256-bit vector of [4 x i64] containing the result.
2417static __inline__ __m256i __DEFAULT_FN_ATTRS256
2418_mm256_srl_epi64(__m256i __a, __m128i __count)
2419{
2420 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2421}
2422
2423/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2424/// vectors. Returns the lower 8 bits of each difference in the
2425/// corresponding byte of the 256-bit integer vector result (overflow is
2426/// ignored).
2427///
2428/// \code{.operation}
2429/// FOR i := 0 TO 31
2430/// j := i*8
2431/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2432/// ENDFOR
2433/// \endcode
2434///
2435/// \headerfile <immintrin.h>
2436///
2437/// This intrinsic corresponds to the \c VPSUBB instruction.
2438///
2439/// \param __a
2440/// A 256-bit integer vector containing the minuends.
2441/// \param __b
2442/// A 256-bit integer vector containing the subtrahends.
2443/// \returns A 256-bit integer vector containing the differences.
2444static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2445_mm256_sub_epi8(__m256i __a, __m256i __b) {
2446 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2447}
2448
2449/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2450/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2451/// the corresponding element of the [16 x i16] result (overflow is
2452/// ignored).
2453///
2454/// \code{.operation}
2455/// FOR i := 0 TO 15
2456/// j := i*16
2457/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2458/// ENDFOR
2459/// \endcode
2460///
2461/// \headerfile <immintrin.h>
2462///
2463/// This intrinsic corresponds to the \c VPSUBW instruction.
2464///
2465/// \param __a
2466/// A 256-bit vector of [16 x i16] containing the minuends.
2467/// \param __b
2468/// A 256-bit vector of [16 x i16] containing the subtrahends.
2469/// \returns A 256-bit vector of [16 x i16] containing the differences.
2470static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2471_mm256_sub_epi16(__m256i __a, __m256i __b) {
2472 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2473}
2474
2475/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2476/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2477/// the corresponding element of the [8 x i32] result (overflow is ignored).
2478///
2479/// \code{.operation}
2480/// FOR i := 0 TO 7
2481/// j := i*32
2482/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2483/// ENDFOR
2484/// \endcode
2485///
2486/// \headerfile <immintrin.h>
2487///
2488/// This intrinsic corresponds to the \c VPSUBD instruction.
2489///
2490/// \param __a
2491/// A 256-bit vector of [8 x i32] containing the minuends.
2492/// \param __b
2493/// A 256-bit vector of [8 x i32] containing the subtrahends.
2494/// \returns A 256-bit vector of [8 x i32] containing the differences.
2495static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2496_mm256_sub_epi32(__m256i __a, __m256i __b) {
2497 return (__m256i)((__v8su)__a - (__v8su)__b);
2498}
2499
2500/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2501/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2502/// the corresponding element of the [4 x i64] result (overflow is ignored).
2503///
2504/// \code{.operation}
2505/// FOR i := 0 TO 3
2506/// j := i*64
2507/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2508/// ENDFOR
2509/// \endcode
2510///
2511/// \headerfile <immintrin.h>
2512///
2513/// This intrinsic corresponds to the \c VPSUBQ instruction.
2514///
2515/// \param __a
2516/// A 256-bit vector of [4 x i64] containing the minuends.
2517/// \param __b
2518/// A 256-bit vector of [4 x i64] containing the subtrahends.
2519/// \returns A 256-bit vector of [4 x i64] containing the differences.
2520static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2521_mm256_sub_epi64(__m256i __a, __m256i __b) {
2522 return (__m256i)((__v4du)__a - (__v4du)__b);
2523}
2524
2525/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2526/// vectors using signed saturation, and returns each difference in the
2527/// corresponding byte of the 256-bit integer vector result.
2528///
2529/// \code{.operation}
2530/// FOR i := 0 TO 31
2531/// j := i*8
2532/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2533/// ENDFOR
2534/// \endcode
2535///
2536/// \headerfile <immintrin.h>
2537///
2538/// This intrinsic corresponds to the \c VPSUBSB instruction.
2539///
2540/// \param __a
2541/// A 256-bit integer vector containing the minuends.
2542/// \param __b
2543/// A 256-bit integer vector containing the subtrahends.
2544/// \returns A 256-bit integer vector containing the differences.
2545static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2546_mm256_subs_epi8(__m256i __a, __m256i __b) {
2547 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2548}
2549
2550/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2551/// vectors of [16 x i16] using signed saturation, and returns each
2552/// difference in the corresponding element of the [16 x i16] result.
2553///
2554/// \code{.operation}
2555/// FOR i := 0 TO 15
2556/// j := i*16
2557/// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2558/// ENDFOR
2559/// \endcode
2560///
2561/// \headerfile <immintrin.h>
2562///
2563/// This intrinsic corresponds to the \c VPSUBSW instruction.
2564///
2565/// \param __a
2566/// A 256-bit vector of [16 x i16] containing the minuends.
2567/// \param __b
2568/// A 256-bit vector of [16 x i16] containing the subtrahends.
2569/// \returns A 256-bit vector of [16 x i16] containing the differences.
2570static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2571_mm256_subs_epi16(__m256i __a, __m256i __b) {
2572 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2573}
2574
2575/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2576/// vectors using unsigned saturation, and returns each difference in the
2577/// corresponding byte of the 256-bit integer vector result. For each byte,
2578/// computes <c> result = __a - __b </c>.
2579///
2580/// \code{.operation}
2581/// FOR i := 0 TO 31
2582/// j := i*8
2583/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2584/// ENDFOR
2585/// \endcode
2586///
2587/// \headerfile <immintrin.h>
2588///
2589/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2590///
2591/// \param __a
2592/// A 256-bit integer vector containing the minuends.
2593/// \param __b
2594/// A 256-bit integer vector containing the subtrahends.
2595/// \returns A 256-bit integer vector containing the differences.
2596static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2597_mm256_subs_epu8(__m256i __a, __m256i __b) {
2598 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2599}
2600
2601/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2602/// vectors of [16 x i16] using unsigned saturation, and returns each
2603/// difference in the corresponding element of the [16 x i16] result.
2604///
2605/// \code{.operation}
2606/// FOR i := 0 TO 15
2607/// j := i*16
2608/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2609/// ENDFOR
2610/// \endcode
2611///
2612/// \headerfile <immintrin.h>
2613///
2614/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2615///
2616/// \param __a
2617/// A 256-bit vector of [16 x i16] containing the minuends.
2618/// \param __b
2619/// A 256-bit vector of [16 x i16] containing the subtrahends.
2620/// \returns A 256-bit vector of [16 x i16] containing the differences.
2621static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2622_mm256_subs_epu16(__m256i __a, __m256i __b) {
2623 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2624}
2625
2626/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2627/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2628/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2629/// input; other bits in these parameters are ignored.
2630///
2631/// \code{.operation}
2632/// result[7:0] := __a[71:64]
2633/// result[15:8] := __b[71:64]
2634/// result[23:16] := __a[79:72]
2635/// result[31:24] := __b[79:72]
2636/// . . .
2637/// result[127:120] := __b[127:120]
2638/// result[135:128] := __a[199:192]
2639/// . . .
2640/// result[255:248] := __b[255:248]
2641/// \endcode
2642///
2643/// \headerfile <immintrin.h>
2644///
2645/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2646///
2647/// \param __a
2648/// A 256-bit integer vector used as the source for the even-numbered bytes
2649/// of the result.
2650/// \param __b
2651/// A 256-bit integer vector used as the source for the odd-numbered bytes
2652/// of the result.
2653/// \returns A 256-bit integer vector containing the result.
2654static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2655_mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
2656 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2657}
2658
2659/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2660/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2661/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2662/// 128-bit half of \a __a and \a __b as input; other bits in these
2663/// parameters are ignored.
2664///
2665/// \code{.operation}
2666/// result[15:0] := __a[79:64]
2667/// result[31:16] := __b[79:64]
2668/// result[47:32] := __a[95:80]
2669/// result[63:48] := __b[95:80]
2670/// . . .
2671/// result[127:112] := __b[127:112]
2672/// result[143:128] := __a[207:192]
2673/// . . .
2674/// result[255:240] := __b[255:240]
2675/// \endcode
2676///
2677/// \headerfile <immintrin.h>
2678///
2679/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2680///
2681/// \param __a
2682/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2683/// elements of the result.
2684/// \param __b
2685/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2686/// elements of the result.
2687/// \returns A 256-bit vector of [16 x i16] containing the result.
2688static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2689_mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
2690 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2691}
2692
2693/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2694/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2695/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2696/// of \a __a and \a __b as input; other bits in these parameters are
2697/// ignored.
2698///
2699/// \code{.operation}
2700/// result[31:0] := __a[95:64]
2701/// result[63:32] := __b[95:64]
2702/// result[95:64] := __a[127:96]
2703/// result[127:96] := __b[127:96]
2704/// result[159:128] := __a[223:192]
2705/// result[191:160] := __b[223:192]
2706/// result[223:192] := __a[255:224]
2707/// result[255:224] := __b[255:224]
2708/// \endcode
2709///
2710/// \headerfile <immintrin.h>
2711///
2712/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2713///
2714/// \param __a
2715/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2716/// elements of the result.
2717/// \param __b
2718/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2719/// elements of the result.
2720/// \returns A 256-bit vector of [8 x i32] containing the result.
2721static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2722_mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
2723 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2724}
2725
2726/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2727/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2728/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2729/// of \a __a and \a __b as input; other bits in these parameters are
2730/// ignored.
2731///
2732/// \code{.operation}
2733/// result[63:0] := __a[127:64]
2734/// result[127:64] := __b[127:64]
2735/// result[191:128] := __a[255:192]
2736/// result[255:192] := __b[255:192]
2737/// \endcode
2738///
2739/// \headerfile <immintrin.h>
2740///
2741/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2742///
2743/// \param __a
2744/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2745/// elements of the result.
2746/// \param __b
2747/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2748/// elements of the result.
2749/// \returns A 256-bit vector of [4 x i64] containing the result.
2750static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2751_mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
2752 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2753}
2754
2755/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2756/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2757/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2758/// input; other bits in these parameters are ignored.
2759///
2760/// \code{.operation}
2761/// result[7:0] := __a[7:0]
2762/// result[15:8] := __b[7:0]
2763/// result[23:16] := __a[15:8]
2764/// result[31:24] := __b[15:8]
2765/// . . .
2766/// result[127:120] := __b[63:56]
2767/// result[135:128] := __a[135:128]
2768/// . . .
2769/// result[255:248] := __b[191:184]
2770/// \endcode
2771///
2772/// \headerfile <immintrin.h>
2773///
2774/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2775///
2776/// \param __a
2777/// A 256-bit integer vector used as the source for the even-numbered bytes
2778/// of the result.
2779/// \param __b
2780/// A 256-bit integer vector used as the source for the odd-numbered bytes
2781/// of the result.
2782/// \returns A 256-bit integer vector containing the result.
2783static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2784_mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
2785 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2786}
2787
2788/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2789/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2790/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2791/// 128-bit half of \a __a and \a __b as input; other bits in these
2792/// parameters are ignored.
2793///
2794/// \code{.operation}
2795/// result[15:0] := __a[15:0]
2796/// result[31:16] := __b[15:0]
2797/// result[47:32] := __a[31:16]
2798/// result[63:48] := __b[31:16]
2799/// . . .
2800/// result[127:112] := __b[63:48]
2801/// result[143:128] := __a[143:128]
2802/// . . .
2803/// result[255:240] := __b[191:176]
2804/// \endcode
2805///
2806/// \headerfile <immintrin.h>
2807///
2808/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2809///
2810/// \param __a
2811/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2812/// elements of the result.
2813/// \param __b
2814/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2815/// elements of the result.
2816/// \returns A 256-bit vector of [16 x i16] containing the result.
2817static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2818_mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
2819 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2820}
2821
2822/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2823/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2824/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2825/// of \a __a and \a __b as input; other bits in these parameters are
2826/// ignored.
2827///
2828/// \code{.operation}
2829/// result[31:0] := __a[31:0]
2830/// result[63:32] := __b[31:0]
2831/// result[95:64] := __a[63:32]
2832/// result[127:96] := __b[63:32]
2833/// result[159:128] := __a[159:128]
2834/// result[191:160] := __b[159:128]
2835/// result[223:192] := __a[191:160]
2836/// result[255:224] := __b[191:160]
2837/// \endcode
2838///
2839/// \headerfile <immintrin.h>
2840///
2841/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2842///
2843/// \param __a
2844/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2845/// elements of the result.
2846/// \param __b
2847/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2848/// elements of the result.
2849/// \returns A 256-bit vector of [8 x i32] containing the result.
2850static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2851_mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
2852 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2853}
2854
2855/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2856/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2857/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2858/// of \a __a and \a __b as input; other bits in these parameters are
2859/// ignored.
2860///
2861/// \code{.operation}
2862/// result[63:0] := __a[63:0]
2863/// result[127:64] := __b[63:0]
2864/// result[191:128] := __a[191:128]
2865/// result[255:192] := __b[191:128]
2866/// \endcode
2867///
2868/// \headerfile <immintrin.h>
2869///
2870/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2871///
2872/// \param __a
2873/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2874/// elements of the result.
2875/// \param __b
2876/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2877/// elements of the result.
2878/// \returns A 256-bit vector of [4 x i64] containing the result.
2879static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2880_mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
2881 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2882}
2883
2884/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2885/// \a __b.
2886///
2887/// \headerfile <immintrin.h>
2888///
2889/// This intrinsic corresponds to the \c VPXOR instruction.
2890///
2891/// \param __a
2892/// A 256-bit integer vector.
2893/// \param __b
2894/// A 256-bit integer vector.
2895/// \returns A 256-bit integer vector containing the result.
2896static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2897_mm256_xor_si256(__m256i __a, __m256i __b)
2898{
2899 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2900}
2901
2902/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2903/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2904/// boundary.
2905///
2906/// \headerfile <immintrin.h>
2907///
2908/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2909///
2910/// \param __V
2911/// A pointer to the 32-byte aligned memory containing the vector to load.
2912/// \returns A 256-bit integer vector loaded from memory.
2913static __inline__ __m256i __DEFAULT_FN_ATTRS256
2915{
2916 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2917 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2918}
2919
2920/// Broadcasts the 32-bit floating-point value from the low element of the
2921/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2922/// 128-bit vector of [4 x float].
2923///
2924/// \headerfile <immintrin.h>
2925///
2926/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2927///
2928/// \param __X
2929/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2930/// \returns A 128-bit vector of [4 x float] containing the result.
2931static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
2933 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
2934}
2935
2936/// Broadcasts the 64-bit floating-point value from the low element of the
2937/// 128-bit vector of [2 x double] in \a __a to both elements of the
2938/// result's 128-bit vector of [2 x double].
2939///
2940/// \headerfile <immintrin.h>
2941///
2942/// This intrinsic corresponds to the \c MOVDDUP instruction.
2943///
2944/// \param __a
2945/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2946/// \returns A 128-bit vector of [2 x double] containing the result.
2947static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
2949 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2950}
2951
2952/// Broadcasts the 32-bit floating-point value from the low element of the
2953/// 128-bit vector of [4 x float] in \a __X to all elements of the
2954/// result's 256-bit vector of [8 x float].
2955///
2956/// \headerfile <immintrin.h>
2957///
2958/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2959///
2960/// \param __X
2961/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2962/// \returns A 256-bit vector of [8 x float] containing the result.
2963static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
2965 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2966}
2967
2968/// Broadcasts the 64-bit floating-point value from the low element of the
2969/// 128-bit vector of [2 x double] in \a __X to all elements of the
2970/// result's 256-bit vector of [4 x double].
2971///
2972/// \headerfile <immintrin.h>
2973///
2974/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2975///
2976/// \param __X
2977/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2978/// \returns A 256-bit vector of [4 x double] containing the result.
2979static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
2981 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
2982}
2983
2984/// Broadcasts the 128-bit integer data from \a __X to both the lower and
2985/// upper halves of the 256-bit result.
2986///
2987/// \headerfile <immintrin.h>
2988///
2989/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
2990///
2991/// \param __X
2992/// A 128-bit integer vector to be broadcast.
2993/// \returns A 256-bit integer vector containing the result.
2994static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2996 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
2997}
2998
/* Alias that forwards to _mm256_broadcastsi128_si256(). */
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3000
3001/// Merges 32-bit integer elements from either of the two 128-bit vectors of
3002/// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3003/// as specified by the immediate integer operand \a M.
3004///
3005/// \code{.operation}
3006/// FOR i := 0 TO 3
3007/// j := i*32
3008/// IF M[i] == 0
3009/// result[31+j:j] := V1[31+j:j]
3010/// ELSE
3011/// result[31+j:j] := V2[31+j:j]
3012/// FI
3013/// ENDFOR
3014/// \endcode
3015///
3016/// \headerfile <immintrin.h>
3017///
3018/// \code
3019/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3020/// \endcode
3021///
3022/// This intrinsic corresponds to the \c VPBLENDD instruction.
3023///
3024/// \param V1
3025/// A 128-bit vector of [4 x i32] containing source values.
3026/// \param V2
3027/// A 128-bit vector of [4 x i32] containing source values.
3028/// \param M
3029/// An immediate 8-bit integer operand, with bits [3:0] specifying the
3030/// source for each element of the result. The position of the mask bit
3031/// corresponds to the index of a copied value. When a mask bit is 0, the
3032/// element is copied from \a V1; otherwise, it is copied from \a V2.
3033/// \returns A 128-bit vector of [4 x i32] containing the result.
/* Operands are cast to [4 x i32]; M is passed to the builtin as an int. */
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128( \
      (__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
3037
3038/// Merges 32-bit integer elements from either of the two 256-bit vectors of
3039/// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3040/// as specified by the immediate integer operand \a M.
3041///
3042/// \code{.operation}
3043/// FOR i := 0 TO 7
3044/// j := i*32
3045/// IF M[i] == 0
3046/// result[31+j:j] := V1[31+j:j]
3047/// ELSE
3048/// result[31+j:j] := V2[31+j:j]
3049/// FI
3050/// ENDFOR
3051/// \endcode
3052///
3053/// \headerfile <immintrin.h>
3054///
3055/// \code
3056/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3057/// \endcode
3058///
3059/// This intrinsic corresponds to the \c VPBLENDD instruction.
3060///
3061/// \param V1
3062/// A 256-bit vector of [8 x i32] containing source values.
3063/// \param V2
3064/// A 256-bit vector of [8 x i32] containing source values.
3065/// \param M
3066/// An immediate 8-bit integer operand, with bits [7:0] specifying the
3067/// source for each element of the result. The position of the mask bit
3068/// corresponds to the index of a copied value. When a mask bit is 0, the
3069/// element is copied from \a V1; otherwise, it is copied from \a V2.
3070/// \returns A 256-bit vector of [8 x i32] containing the result.
/* Operands are cast to [8 x i32]; M is passed to the builtin as an int. */
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256( \
      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (int)(M)))
3074
3075/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3076/// bytes of the 256-bit result.
3077///
3078/// \headerfile <immintrin.h>
3079///
3080/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3081///
3082/// \param __X
3083/// A 128-bit integer vector whose low byte will be broadcast.
3084/// \returns A 256-bit integer vector containing the result.
3085static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3087 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3088}
3089
3090/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3091/// to all elements of the result's 256-bit vector of [16 x i16].
3092///
3093/// \headerfile <immintrin.h>
3094///
3095/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3096///
3097/// \param __X
3098/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3099/// \returns A 256-bit vector of [16 x i16] containing the result.
3100static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3102 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3103}
3104
3105/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3106/// to all elements of the result's 256-bit vector of [8 x i32].
3107///
3108/// \headerfile <immintrin.h>
3109///
3110/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3111///
3112/// \param __X
3113/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3114/// \returns A 256-bit vector of [8 x i32] containing the result.
3115static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3117 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3118}
3119
3120/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3121/// to all elements of the result's 256-bit vector of [4 x i64].
3122///
3123/// \headerfile <immintrin.h>
3124///
3125/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3126///
3127/// \param __X
3128/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3129/// \returns A 256-bit vector of [4 x i64] containing the result.
3130static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3132 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3133}
3134
3135/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3136/// bytes of the 128-bit result.
3137///
3138/// \headerfile <immintrin.h>
3139///
3140/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3141///
3142/// \param __X
3143/// A 128-bit integer vector whose low byte will be broadcast.
3144/// \returns A 128-bit integer vector containing the result.
3145static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3147 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3148}
3149
3150/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3151/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3152///
3153/// \headerfile <immintrin.h>
3154///
3155/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3156///
3157/// \param __X
3158/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3159/// \returns A 128-bit vector of [8 x i16] containing the result.
3160static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3162 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3163}
3164
3165/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3166/// to all elements of the result's 128-bit vector of [4 x i32].
3167///
3168/// \headerfile <immintrin.h>
3169///
3170/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3171///
3172/// \param __X
3173/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3174/// \returns A 128-bit vector of [4 x i32] containing the result.
3175static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3177 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3178}
3179
3180/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3181/// to both elements of the result's 128-bit vector of [2 x i64].
3182///
3183/// \headerfile <immintrin.h>
3184///
3185/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3186///
3187/// \param __X
3188/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3189/// \returns A 128-bit vector of [2 x i64] containing the result.
3190static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3192 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3193}
3194
3195/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3196/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3197/// elements of the 256-bit vector of [8 x i32] in \a __b.
3198///
3199/// \code{.operation}
3200/// FOR i := 0 TO 7
3201/// j := i*32
3202/// k := __b[j+2:j] * 32
3203/// result[j+31:j] := __a[k+31:k]
3204/// ENDFOR
3205/// \endcode
3206///
3207/// \headerfile <immintrin.h>
3208///
3209/// This intrinsic corresponds to the \c VPERMD instruction.
3210///
3211/// \param __a
3212/// A 256-bit vector of [8 x i32] containing the source values.
3213/// \param __b
3214/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3215/// \a __a.
3216/// \returns A 256-bit vector of [8 x i32] containing the result.
3217static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3219 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3220}
3221
/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
///    the 256-bit vector of [4 x double] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3251
3252/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3253/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3254/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3255///
3256/// \code{.operation}
3257/// FOR i := 0 TO 7
3258/// j := i*32
3259/// k := __b[j+2:j] * 32
3260/// result[j+31:j] := __a[k+31:k]
3261/// ENDFOR
3262/// \endcode
3263///
3264/// \headerfile <immintrin.h>
3265///
3266/// This intrinsic corresponds to the \c VPERMPS instruction.
3267///
3268/// \param __a
3269/// A 256-bit vector of [8 x float] containing the source values.
3270/// \param __b
3271/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3272/// \a __a.
3273/// \returns A 256-bit vector of [8 x float] containing the result.
3274static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
3276 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3277}
3278
/// Sets the result's 256-bit vector of [4 x i64] to copies of elements of the
///    256-bit vector of [4 x i64] in \a V as specified by the immediate value
///    \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMQ instruction.
///
/// \param V
///    A 256-bit vector of [4 x i64] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3308
/// Sets each half of the 256-bit result either to zero or to one of the
///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
///    as specified by the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value specifying how to form the result. Bits [3:0]
///    control the lower half of the result, bits [7:4] control the upper half.
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
///    otherwise bits [1:0] determine the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3354
/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
///    otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3374
/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
///    result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
///    is zero, overwrites the lower half of the result; otherwise,
///    overwrites the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                         (__v2di)(__m128i)(V2), (int)(M)))
3398
3399/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3400/// the most significant bit of the corresponding element in the mask
3401/// \a __M is set; otherwise, sets that element of the result to zero.
3402/// Returns the 256-bit [8 x i32] result.
3403///
3404/// \code{.operation}
3405/// FOR i := 0 TO 7
3406/// j := i*32
3407/// IF __M[j+31] == 1
3408/// result[j+31:j] := Load32(__X+(i*4))
3409/// ELSE
3410/// result[j+31:j] := 0
3411/// FI
3412/// ENDFOR
3413/// \endcode
3414///
3415/// \headerfile <immintrin.h>
3416///
3417/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3418///
3419/// \param __X
3420/// A pointer to the memory used for loading values.
3421/// \param __M
3422/// A 256-bit vector of [8 x i32] containing the mask bits.
3423/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3424/// elements.
3425static __inline__ __m256i __DEFAULT_FN_ATTRS256
3426_mm256_maskload_epi32(int const *__X, __m256i __M)
3427{
3428 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3429}
3430
3431/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3432/// the most significant bit of the corresponding element in the mask
3433/// \a __M is set; otherwise, sets that element of the result to zero.
3434/// Returns the 256-bit [4 x i64] result.
3435///
3436/// \code{.operation}
3437/// FOR i := 0 TO 3
3438/// j := i*64
3439/// IF __M[j+63] == 1
3440/// result[j+63:j] := Load64(__X+(i*8))
3441/// ELSE
3442/// result[j+63:j] := 0
3443/// FI
3444/// ENDFOR
3445/// \endcode
3446///
3447/// \headerfile <immintrin.h>
3448///
3449/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3450///
3451/// \param __X
3452/// A pointer to the memory used for loading values.
3453/// \param __M
3454/// A 256-bit vector of [4 x i64] containing the mask bits.
3455/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3456/// elements.
3457static __inline__ __m256i __DEFAULT_FN_ATTRS256
3458_mm256_maskload_epi64(long long const *__X, __m256i __M)
3459{
3460 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3461}
3462
3463/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3464/// the most significant bit of the corresponding element in the mask
3465/// \a __M is set; otherwise, sets that element of the result to zero.
3466/// Returns the 128-bit [4 x i32] result.
3467///
3468/// \code{.operation}
3469/// FOR i := 0 TO 3
3470/// j := i*32
3471/// IF __M[j+31] == 1
3472/// result[j+31:j] := Load32(__X+(i*4))
3473/// ELSE
3474/// result[j+31:j] := 0
3475/// FI
3476/// ENDFOR
3477/// \endcode
3478///
3479/// \headerfile <immintrin.h>
3480///
3481/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3482///
3483/// \param __X
3484/// A pointer to the memory used for loading values.
3485/// \param __M
3486/// A 128-bit vector of [4 x i32] containing the mask bits.
3487/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3488/// elements.
3489static __inline__ __m128i __DEFAULT_FN_ATTRS128
3490_mm_maskload_epi32(int const *__X, __m128i __M)
3491{
3492 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3493}
3494
3495/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3496/// the most significant bit of the corresponding element in the mask
3497/// \a __M is set; otherwise, sets that element of the result to zero.
3498/// Returns the 128-bit [2 x i64] result.
3499///
3500/// \code{.operation}
3501/// FOR i := 0 TO 1
3502/// j := i*64
3503/// IF __M[j+63] == 1
3504/// result[j+63:j] := Load64(__X+(i*8))
3505/// ELSE
3506/// result[j+63:j] := 0
3507/// FI
3508/// ENDFOR
3509/// \endcode
3510///
3511/// \headerfile <immintrin.h>
3512///
3513/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3514///
3515/// \param __X
3516/// A pointer to the memory used for loading values.
3517/// \param __M
3518/// A 128-bit vector of [2 x i64] containing the mask bits.
3519/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3520/// elements.
3521static __inline__ __m128i __DEFAULT_FN_ATTRS128
3522_mm_maskload_epi64(long long const *__X, __m128i __M)
3523{
3524 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3525}
3526
3527/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3528/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3529/// the corresponding element in the mask \a __M is set; otherwise, the
3530/// memory element is unchanged.
3531///
3532/// \code{.operation}
3533/// FOR i := 0 TO 7
3534/// j := i*32
3535/// IF __M[j+31] == 1
3536/// Store32(__X+(i*4), __Y[j+31:j])
3537/// FI
3538/// ENDFOR
3539/// \endcode
3540///
3541/// \headerfile <immintrin.h>
3542///
3543/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3544///
3545/// \param __X
3546/// A pointer to the memory used for storing values.
3547/// \param __M
3548/// A 256-bit vector of [8 x i32] containing the mask bits.
3549/// \param __Y
3550/// A 256-bit vector of [8 x i32] containing the values to store.
3551static __inline__ void __DEFAULT_FN_ATTRS256
3552_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3553{
3554 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3555}
3556
3557/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3558/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3559/// the corresponding element in the mask \a __M is set; otherwise, the
3560/// memory element is unchanged.
3561///
3562/// \code{.operation}
3563/// FOR i := 0 TO 3
3564/// j := i*64
3565/// IF __M[j+63] == 1
3566/// Store64(__X+(i*8), __Y[j+63:j])
3567/// FI
3568/// ENDFOR
3569/// \endcode
3570///
3571/// \headerfile <immintrin.h>
3572///
3573/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3574///
3575/// \param __X
3576/// A pointer to the memory used for storing values.
3577/// \param __M
3578/// A 256-bit vector of [4 x i64] containing the mask bits.
3579/// \param __Y
3580/// A 256-bit vector of [4 x i64] containing the values to store.
3581static __inline__ void __DEFAULT_FN_ATTRS256
3582_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3583{
3584 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3585}
3586
3587/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3588/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3589/// the corresponding element in the mask \a __M is set; otherwise, the
3590/// memory element is unchanged.
3591///
3592/// \code{.operation}
3593/// FOR i := 0 TO 3
3594/// j := i*32
3595/// IF __M[j+31] == 1
3596/// Store32(__X+(i*4), __Y[j+31:j])
3597/// FI
3598/// ENDFOR
3599/// \endcode
3600///
3601/// \headerfile <immintrin.h>
3602///
3603/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3604///
3605/// \param __X
3606/// A pointer to the memory used for storing values.
3607/// \param __M
3608/// A 128-bit vector of [4 x i32] containing the mask bits.
3609/// \param __Y
3610/// A 128-bit vector of [4 x i32] containing the values to store.
3611static __inline__ void __DEFAULT_FN_ATTRS128
3612_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3613{
3614 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3615}
3616
3617/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3618/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3619/// the corresponding element in the mask \a __M is set; otherwise, the
3620/// memory element is unchanged.
3621///
3622/// \code{.operation}
3623/// FOR i := 0 TO 1
3624/// j := i*64
3625/// IF __M[j+63] == 1
3626/// Store64(__X+(i*8), __Y[j+63:j])
3627/// FI
3628/// ENDFOR
3629/// \endcode
3630///
3631/// \headerfile <immintrin.h>
3632///
3633/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3634///
3635/// \param __X
3636/// A pointer to the memory used for storing values.
3637/// \param __M
3638/// A 128-bit vector of [2 x i64] containing the mask bits.
3639/// \param __Y
3640/// A 128-bit vector of [2 x i64] containing the values to store.
3641static __inline__ void __DEFAULT_FN_ATTRS128
3642_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3643{
3644 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3645}
3646
3647/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3648/// left by the number of bits given in the corresponding element of the
3649/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3650/// returns the result. If the shift count for any element is greater than
3651/// 31, the result for that element is zero.
3652///
3653/// \headerfile <immintrin.h>
3654///
3655/// This intrinsic corresponds to the \c VPSLLVD instruction.
3656///
3657/// \param __X
3658/// A 256-bit vector of [8 x i32] to be shifted.
3659/// \param __Y
3660/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3661/// bits).
3662/// \returns A 256-bit vector of [8 x i32] containing the result.
3663static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3664_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3665{
3666 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3667}
3668
3669/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3670/// left by the number of bits given in the corresponding element of the
3671/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3672/// returns the result. If the shift count for any element is greater than
3673/// 31, the result for that element is zero.
3674///
3675/// \headerfile <immintrin.h>
3676///
3677/// This intrinsic corresponds to the \c VPSLLVD instruction.
3678///
3679/// \param __X
3680/// A 128-bit vector of [4 x i32] to be shifted.
3681/// \param __Y
3682/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3683/// bits).
3684/// \returns A 128-bit vector of [4 x i32] containing the result.
3685static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3686_mm_sllv_epi32(__m128i __X, __m128i __Y)
3687{
3688 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3689}
3690
3691/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3692/// left by the number of bits given in the corresponding element of the
3693/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3694/// returns the result. If the shift count for any element is greater than
3695/// 63, the result for that element is zero.
3696///
3697/// \headerfile <immintrin.h>
3698///
3699/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3700///
3701/// \param __X
3702/// A 256-bit vector of [4 x i64] to be shifted.
3703/// \param __Y
3704/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3705/// bits).
3706/// \returns A 256-bit vector of [4 x i64] containing the result.
3707static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3708_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3709{
3710 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3711}
3712
3713/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3714/// left by the number of bits given in the corresponding element of the
3715/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3716/// returns the result. If the shift count for any element is greater than
3717/// 63, the result for that element is zero.
3718///
3719/// \headerfile <immintrin.h>
3720///
3721/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3722///
3723/// \param __X
3724/// A 128-bit vector of [2 x i64] to be shifted.
3725/// \param __Y
3726/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3727/// bits).
3728/// \returns A 128-bit vector of [2 x i64] containing the result.
3729static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3730_mm_sllv_epi64(__m128i __X, __m128i __Y)
3731{
3732 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3733}
3734
3735/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3736/// right by the number of bits given in the corresponding element of the
3737/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3738/// returns the result. If the shift count for any element is greater than
3739/// 31, the result for that element is 0 or -1 according to the sign bit
3740/// for that element.
3741///
3742/// \headerfile <immintrin.h>
3743///
3744/// This intrinsic corresponds to the \c VPSRAVD instruction.
3745///
3746/// \param __X
3747/// A 256-bit vector of [8 x i32] to be shifted.
3748/// \param __Y
3749/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3750/// bits).
3751/// \returns A 256-bit vector of [8 x i32] containing the result.
3752static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3753_mm256_srav_epi32(__m256i __X, __m256i __Y)
3754{
3755 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3756}
3757
3758/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3759/// right by the number of bits given in the corresponding element of the
3760/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3761/// returns the result. If the shift count for any element is greater than
3762/// 31, the result for that element is 0 or -1 according to the sign bit
3763/// for that element.
3764///
3765/// \headerfile <immintrin.h>
3766///
3767/// This intrinsic corresponds to the \c VPSRAVD instruction.
3768///
3769/// \param __X
3770/// A 128-bit vector of [4 x i32] to be shifted.
3771/// \param __Y
3772/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3773/// bits).
3774/// \returns A 128-bit vector of [4 x i32] containing the result.
3775static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3776_mm_srav_epi32(__m128i __X, __m128i __Y)
3777{
3778 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3779}
3780
3781/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3782/// right by the number of bits given in the corresponding element of the
3783/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3784/// returns the result. If the shift count for any element is greater than
3785/// 31, the result for that element is zero.
3786///
3787/// \headerfile <immintrin.h>
3788///
3789/// This intrinsic corresponds to the \c VPSRLVD instruction.
3790///
3791/// \param __X
3792/// A 256-bit vector of [8 x i32] to be shifted.
3793/// \param __Y
3794/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3795/// bits).
3796/// \returns A 256-bit vector of [8 x i32] containing the result.
3797static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3798_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3799{
3800 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3801}
3802
3803/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3804/// right by the number of bits given in the corresponding element of the
3805/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3806/// returns the result. If the shift count for any element is greater than
3807/// 31, the result for that element is zero.
3808///
3809/// \headerfile <immintrin.h>
3810///
3811/// This intrinsic corresponds to the \c VPSRLVD instruction.
3812///
3813/// \param __X
3814/// A 128-bit vector of [4 x i32] to be shifted.
3815/// \param __Y
3816/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3817/// bits).
3818/// \returns A 128-bit vector of [4 x i32] containing the result.
3819static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3820_mm_srlv_epi32(__m128i __X, __m128i __Y)
3821{
3822 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3823}
3824
3825/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3826/// right by the number of bits given in the corresponding element of the
3827/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3828/// returns the result. If the shift count for any element is greater than
3829/// 63, the result for that element is zero.
3830///
3831/// \headerfile <immintrin.h>
3832///
3833/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3834///
3835/// \param __X
3836/// A 256-bit vector of [4 x i64] to be shifted.
3837/// \param __Y
3838/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3839/// bits).
3840/// \returns A 256-bit vector of [4 x i64] containing the result.
3841static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3842_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3843{
3844 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3845}
3846
3847/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3848/// right by the number of bits given in the corresponding element of the
3849/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3850/// returns the result. If the shift count for any element is greater than
3851/// 63, the result for that element is zero.
3852///
3853/// \headerfile <immintrin.h>
3854///
3855/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3856///
3857/// \param __X
3858/// A 128-bit vector of [2 x i64] to be shifted.
3859/// \param __Y
3860/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3861/// bits).
3862/// \returns A 128-bit vector of [2 x i64] containing the result.
3863static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3864_mm_srlv_epi64(__m128i __X, __m128i __Y)
3865{
3866 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3867}
3868
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
3917
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
3965
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
4013
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
4061
4062/// Conditionally gathers four 32-bit floating-point values, either from the
4063/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4064/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4065/// of [4 x float] in \a mask determines the source for each element.
4066///
4067/// \code{.operation}
4068/// FOR element := 0 to 3
4069/// j := element*32
4070/// k := element*32
4071/// IF mask[j+31] == 0
4072/// result[j+31:j] := a[j+31:j]
4073/// ELSE
4074/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4075/// FI
4076/// ENDFOR
4077/// \endcode
4078///
4079/// \headerfile <immintrin.h>
4080///
4081/// \code
4082/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4083/// __m128 mask, const int s);
4084/// \endcode
4085///
4086/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4087///
4088/// \param a
4089/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4090/// zero.
4091/// \param m
4092/// A pointer to the memory used for loading values.
4093/// \param i
4094/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4095/// \param mask
4096/// A 128-bit vector of [4 x float] containing the mask. The most
4097/// significant bit of each element in the mask vector represents the mask
4098/// bits. If a mask bit is zero, the corresponding value from vector \a a
4099/// is gathered; otherwise the value is loaded from memory.
4100/// \param s
4101/// A literal constant scale factor for the indexes in \a i. Must be
4102/// 1, 2, 4, or 8.
4103/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Expands to __builtin_ia32_gatherd_ps(a, m, i, mask, s): a and mask are cast
   to __v4sf, i to __v4si and m to float const *. Every macro argument is
   parenthesized before it is cast. */
4104#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
4105 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
4106 (float const *)(m), \
4107 (__v4si)(__m128i)(i), \
4108 (__v4sf)(__m128)(mask), (s)))
4109
4110/// Conditionally gathers eight 32-bit floating-point values, either from the
4111/// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4112/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4113/// of [8 x float] in \a mask determines the source for each element.
4114///
4115/// \code{.operation}
4116/// FOR element := 0 to 7
4117/// j := element*32
4118/// k := element*32
4119/// IF mask[j+31] == 0
4120/// result[j+31:j] := a[j+31:j]
4121/// ELSE
4122/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4123/// FI
4124/// ENDFOR
4125/// \endcode
4126///
4127/// \headerfile <immintrin.h>
4128///
4129/// \code
4130/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4131/// __m256 mask, const int s);
4132/// \endcode
4133///
4134/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4135///
4136/// \param a
4137/// A 256-bit vector of [8 x float] used as the source when a mask bit is
4138/// zero.
4139/// \param m
4140/// A pointer to the memory used for loading values.
4141/// \param i
4142/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4143/// \param mask
4144/// A 256-bit vector of [8 x float] containing the mask. The most
4145/// significant bit of each element in the mask vector represents the mask
4146/// bits. If a mask bit is zero, the corresponding value from vector \a a
4147/// is gathered; otherwise the value is loaded from memory.
4148/// \param s
4149/// A literal constant scale factor for the indexes in \a i. Must be
4150/// 1, 2, 4, or 8.
4151/// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Expands to __builtin_ia32_gatherd_ps256(a, m, i, mask, s): a and mask are
   cast to __v8sf, i to __v8si and m to float const *. Every macro argument is
   parenthesized before it is cast. */
4152#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
4153 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
4154 (float const *)(m), \
4155 (__v8si)(__m256i)(i), \
4156 (__v8sf)(__m256)(mask), (s)))
4157
4158/// Conditionally gathers two 32-bit floating-point values, either from the
4159/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4160/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4161/// of [4 x float] in \a mask determines the source for the lower two
4162/// elements. The upper two elements of the result are zeroed.
4163///
4164/// \code{.operation}
4165/// FOR element := 0 to 1
4166/// j := element*32
4167/// k := element*64
4168/// IF mask[j+31] == 0
4169/// result[j+31:j] := a[j+31:j]
4170/// ELSE
4171/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4172/// FI
4173/// ENDFOR
4174/// result[127:64] := 0
4175/// \endcode
4176///
4177/// \headerfile <immintrin.h>
4178///
4179/// \code
4180/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4181/// __m128 mask, const int s);
4182/// \endcode
4183///
4184/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4185///
4186/// \param a
4187/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4188/// zero. Only the first two elements are used.
4189/// \param m
4190/// A pointer to the memory used for loading values.
4191/// \param i
4192/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4193/// \param mask
4194/// A 128-bit vector of [4 x float] containing the mask. The most
4195/// significant bit of each element in the mask vector represents the mask
4196/// bits. If a mask bit is zero, the corresponding value from vector \a a
4197/// is gathered; otherwise the value is loaded from memory. Only the first
4198/// two elements are used.
4199/// \param s
4200/// A literal constant scale factor for the indexes in \a i. Must be
4201/// 1, 2, 4, or 8.
4202/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Expands to __builtin_ia32_gatherq_ps(a, m, i, mask, s): a and mask are cast
   to __v4sf, i to __v2di (two 64-bit indexes) and m to float const *. Every
   macro argument is parenthesized before it is cast. */
4203#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
4204 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
4205 (float const *)(m), \
4206 (__v2di)(__m128i)(i), \
4207 (__v4sf)(__m128)(mask), (s)))
4208
4209/// Conditionally gathers four 32-bit floating-point values, either from the
4210/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4211/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4212/// of [4 x float] in \a mask determines the source for each element.
4213///
4214/// \code{.operation}
4215/// FOR element := 0 to 3
4216/// j := element*32
4217/// k := element*64
4218/// IF mask[j+31] == 0
4219/// result[j+31:j] := a[j+31:j]
4220/// ELSE
4221/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4222/// FI
4223/// ENDFOR
4224/// \endcode
4225///
4226/// \headerfile <immintrin.h>
4227///
4228/// \code
4229/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4230/// __m128 mask, const int s);
4231/// \endcode
4232///
4233/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4234///
4235/// \param a
4236/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4237/// zero.
4238/// \param m
4239/// A pointer to the memory used for loading values.
4240/// \param i
4241/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4242/// \param mask
4243/// A 128-bit vector of [4 x float] containing the mask. The most
4244/// significant bit of each element in the mask vector represents the mask
4245/// bits. If a mask bit is zero, the corresponding value from vector \a a
4246/// is gathered; otherwise the value is loaded from memory.
4247/// \param s
4248/// A literal constant scale factor for the indexes in \a i. Must be
4249/// 1, 2, 4, or 8.
4250/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Expands to __builtin_ia32_gatherq_ps256(a, m, i, mask, s): a and mask are
   128-bit and cast to __v4sf, while the index vector i is 256-bit and cast to
   __v4di; m is cast to float const *. Every macro argument is parenthesized
   before it is cast. */
4251#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
4252 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
4253 (float const *)(m), \
4254 (__v4di)(__m256i)(i), \
4255 (__v4sf)(__m128)(mask), (s)))
4256
4257/// Conditionally gathers four 32-bit integer values, either from the
4258/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4259/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4260/// of [4 x i32] in \a mask determines the source for each element.
4261///
4262/// \code{.operation}
4263/// FOR element := 0 to 3
4264/// j := element*32
4265/// k := element*32
4266/// IF mask[j+31] == 0
4267/// result[j+31:j] := a[j+31:j]
4268/// ELSE
4269/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4270/// FI
4271/// ENDFOR
4272/// \endcode
4273///
4274/// \headerfile <immintrin.h>
4275///
4276/// \code
4277/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4278/// __m128i mask, const int s);
4279/// \endcode
4280///
4281/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4282///
4283/// \param a
4284/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4285/// zero.
4286/// \param m
4287/// A pointer to the memory used for loading values.
4288/// \param i
4289/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4290/// \param mask
4291/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4292/// bit of each element in the mask vector represents the mask bits. If a
4293/// mask bit is zero, the corresponding value from vector \a a is gathered;
4294/// otherwise the value is loaded from memory.
4295/// \param s
4296/// A literal constant scale factor for the indexes in \a i. Must be
4297/// 1, 2, 4, or 8.
4298/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Expands to __builtin_ia32_gatherd_d(a, m, i, mask, s): a, i and mask are
   all cast to __v4si and m to int const *. Every macro argument is
   parenthesized before it is cast. */
4299#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
4300 ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
4301 (int const *)(m), \
4302 (__v4si)(__m128i)(i), \
4303 (__v4si)(__m128i)(mask), (s)))
4304
4305/// Conditionally gathers eight 32-bit integer values, either from the
4306/// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4307/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4308/// of [8 x i32] in \a mask determines the source for each element.
4309///
4310/// \code{.operation}
4311/// FOR element := 0 to 7
4312/// j := element*32
4313/// k := element*32
4314/// IF mask[j+31] == 0
4315/// result[j+31:j] := a[j+31:j]
4316/// ELSE
4317/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4318/// FI
4319/// ENDFOR
4320/// \endcode
4321///
4322/// \headerfile <immintrin.h>
4323///
4324/// \code
4325/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4326/// __m256i mask, const int s);
4327/// \endcode
4328///
4329/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4330///
4331/// \param a
4332/// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4333/// zero.
4334/// \param m
4335/// A pointer to the memory used for loading values.
4336/// \param i
4337/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4338/// \param mask
4339/// A 256-bit vector of [8 x i32] containing the mask. The most significant
4340/// bit of each element in the mask vector represents the mask bits. If a
4341/// mask bit is zero, the corresponding value from vector \a a is gathered;
4342/// otherwise the value is loaded from memory.
4343/// \param s
4344/// A literal constant scale factor for the indexes in \a i. Must be
4345/// 1, 2, 4, or 8.
4346/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Expands to __builtin_ia32_gatherd_d256(a, m, i, mask, s): a, i and mask are
   all cast to __v8si and m to int const *. Every macro argument is
   parenthesized before it is cast. */
4347#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
4348 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
4349 (int const *)(m), \
4350 (__v8si)(__m256i)(i), \
4351 (__v8si)(__m256i)(mask), (s)))
4352
4353/// Conditionally gathers two 32-bit integer values, either from the
4354/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4355/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4356/// of [4 x i32] in \a mask determines the source for the lower two
4357/// elements. The upper two elements of the result are zeroed.
4358///
4359/// \code{.operation}
4360/// FOR element := 0 to 1
4361/// j := element*32
4362/// k := element*64
4363/// IF mask[j+31] == 0
4364/// result[j+31:j] := a[j+31:j]
4365/// ELSE
4366/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4367/// FI
4368/// ENDFOR
4369/// result[127:64] := 0
4370/// \endcode
4371///
4372/// \headerfile <immintrin.h>
4373///
4374/// \code
4375/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4376/// __m128i mask, const int s);
4377/// \endcode
4378///
4379/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4380///
4381/// \param a
4382/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4383/// zero. Only the first two elements are used.
4384/// \param m
4385/// A pointer to the memory used for loading values.
4386/// \param i
4387/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4388/// \param mask
4389/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4390/// bit of each element in the mask vector represents the mask bits. If a
4391/// mask bit is zero, the corresponding value from vector \a a is gathered;
4392/// otherwise the value is loaded from memory. Only the first two elements
4393/// are used.
4394/// \param s
4395/// A literal constant scale factor for the indexes in \a i. Must be
4396/// 1, 2, 4, or 8.
4397/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Expands to __builtin_ia32_gatherq_d(a, m, i, mask, s): a and mask are cast
   to __v4si, i to __v2di (two 64-bit indexes) and m to int const *. Every
   macro argument is parenthesized before it is cast. */
4398#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
4399 ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
4400 (int const *)(m), \
4401 (__v2di)(__m128i)(i), \
4402 (__v4si)(__m128i)(mask), (s)))
4403
4404/// Conditionally gathers four 32-bit integer values, either from the
4405/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4406/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4407/// of [4 x i32] in \a mask determines the source for each element.
4408///
4409/// \code{.operation}
4410/// FOR element := 0 to 3
4411/// j := element*32
4412/// k := element*64
4413/// IF mask[j+31] == 0
4414/// result[j+31:j] := a[j+31:j]
4415/// ELSE
4416/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4417/// FI
4418/// ENDFOR
4419/// \endcode
4420///
4421/// \headerfile <immintrin.h>
4422///
4423/// \code
4424/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4425/// __m128i mask, const int s);
4426/// \endcode
4427///
4428/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4429///
4430/// \param a
4431/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4432/// zero.
4433/// \param m
4434/// A pointer to the memory used for loading values.
4435/// \param i
4436/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4437/// \param mask
4438/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4439/// bit of each element in the mask vector represents the mask bits. If a
4440/// mask bit is zero, the corresponding value from vector \a a is gathered;
4441/// otherwise the value is loaded from memory.
4442/// \param s
4443/// A literal constant scale factor for the indexes in \a i. Must be
4444/// 1, 2, 4, or 8.
4445/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Expands to __builtin_ia32_gatherq_d256(a, m, i, mask, s): a and mask are
   128-bit and cast to __v4si, while the index vector i is 256-bit and cast to
   __v4di; m is cast to int const *. Every macro argument is parenthesized
   before it is cast. */
4446#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
4447 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
4448 (int const *)(m), \
4449 (__v4di)(__m256i)(i), \
4450 (__v4si)(__m128i)(mask), (s)))
4451
4452/// Conditionally gathers two 64-bit integer values, either from the
4453/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4454/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4455/// of [2 x i64] in \a mask determines the source for each element.
4456///
4457/// \code{.operation}
4458/// FOR element := 0 to 1
4459/// j := element*64
4460/// k := element*32
4461/// IF mask[j+63] == 0
4462/// result[j+63:j] := a[j+63:j]
4463/// ELSE
4464/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4465/// FI
4466/// ENDFOR
4467/// \endcode
4468///
4469/// \headerfile <immintrin.h>
4470///
4471/// \code
4472/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4473/// __m128i mask, const int s);
4474/// \endcode
4475///
4476/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4477///
4478/// \param a
4479/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4480/// zero.
4481/// \param m
4482/// A pointer to the memory used for loading values.
4483/// \param i
4484/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4485/// the first two elements are used.
4486/// \param mask
4487/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4488/// bit of each element in the mask vector represents the mask bits. If a
4489/// mask bit is zero, the corresponding value from vector \a a is gathered;
4490/// otherwise the value is loaded from memory.
4491/// \param s
4492/// A literal constant scale factor for the indexes in \a i. Must be
4493/// 1, 2, 4, or 8.
4494/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Expands to __builtin_ia32_gatherd_q(a, m, i, mask, s): a and mask are cast
   to __v2di, i to __v4si and m to long long const *. Every macro argument is
   parenthesized before it is cast. */
4495#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
4496 ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
4497 (long long const *)(m), \
4498 (__v4si)(__m128i)(i), \
4499 (__v2di)(__m128i)(mask), (s)))
4500
4501/// Conditionally gathers four 64-bit integer values, either from the
4502/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4503/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4504/// of [4 x i64] in \a mask determines the source for each element.
4505///
4506/// \code{.operation}
4507/// FOR element := 0 to 3
4508/// j := element*64
4509/// k := element*32
4510/// IF mask[j+63] == 0
4511/// result[j+63:j] := a[j+63:j]
4512/// ELSE
4513/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4514/// FI
4515/// ENDFOR
4516/// \endcode
4517///
4518/// \headerfile <immintrin.h>
4519///
4520/// \code
4521/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4522/// __m128i i, __m256i mask, const int s);
4523/// \endcode
4524///
4525/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4526///
4527/// \param a
4528/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4529/// zero.
4530/// \param m
4531/// A pointer to the memory used for loading values.
4532/// \param i
4533/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4534/// \param mask
4535/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4536/// bit of each element in the mask vector represents the mask bits. If a
4537/// mask bit is zero, the corresponding value from vector \a a is gathered;
4538/// otherwise the value is loaded from memory.
4539/// \param s
4540/// A literal constant scale factor for the indexes in \a i. Must be
4541/// 1, 2, 4, or 8.
4542/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Expands to __builtin_ia32_gatherd_q256(a, m, i, mask, s): a and mask are
   256-bit and cast to __v4di, while the index vector i is 128-bit and cast to
   __v4si; m is cast to long long const *. Every macro argument is
   parenthesized before it is cast. */
4543#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
4544 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
4545 (long long const *)(m), \
4546 (__v4si)(__m128i)(i), \
4547 (__v4di)(__m256i)(mask), (s)))
4548
4549/// Conditionally gathers two 64-bit integer values, either from the
4550/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4551/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4552/// of [2 x i64] in \a mask determines the source for each element.
4553///
4554/// \code{.operation}
4555/// FOR element := 0 to 1
4556/// j := element*64
4557/// k := element*64
4558/// IF mask[j+63] == 0
4559/// result[j+63:j] := a[j+63:j]
4560/// ELSE
4561/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4562/// FI
4563/// ENDFOR
4564/// \endcode
4565///
4566/// \headerfile <immintrin.h>
4567///
4568/// \code
4569/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4570/// __m128i mask, const int s);
4571/// \endcode
4572///
4573/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4574///
4575/// \param a
4576/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4577/// zero.
4578/// \param m
4579/// A pointer to the memory used for loading values.
4580/// \param i
4581/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4582/// \param mask
4583/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4584/// bit of each element in the mask vector represents the mask bits. If a
4585/// mask bit is zero, the corresponding value from vector \a a is gathered;
4586/// otherwise the value is loaded from memory.
4587/// \param s
4588/// A literal constant scale factor for the indexes in \a i. Must be
4589/// 1, 2, 4, or 8.
4590/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Expands to __builtin_ia32_gatherq_q(a, m, i, mask, s): a, i and mask are
   all cast to __v2di and m to long long const *. Every macro argument is
   parenthesized before it is cast. */
4591#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
4592 ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
4593 (long long const *)(m), \
4594 (__v2di)(__m128i)(i), \
4595 (__v2di)(__m128i)(mask), (s)))
4596
4597/// Conditionally gathers four 64-bit integer values, either from the
4598/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4599/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4600/// of [4 x i64] in \a mask determines the source for each element.
4601///
4602/// \code{.operation}
4603/// FOR element := 0 to 3
4604/// j := element*64
4605/// k := element*64
4606/// IF mask[j+63] == 0
4607/// result[j+63:j] := a[j+63:j]
4608/// ELSE
4609/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4610/// FI
4611/// ENDFOR
4612/// \endcode
4613///
4614/// \headerfile <immintrin.h>
4615///
4616/// \code
4617/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4618/// __m256i i, __m256i mask, const int s);
4619/// \endcode
4620///
4621/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4622///
4623/// \param a
4624/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4625/// zero.
4626/// \param m
4627/// A pointer to the memory used for loading values.
4628/// \param i
4629/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4630/// \param mask
4631/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4632/// bit of each element in the mask vector represents the mask bits. If a
4633/// mask bit is zero, the corresponding value from vector \a a is gathered;
4634/// otherwise the value is loaded from memory.
4635/// \param s
4636/// A literal constant scale factor for the indexes in \a i. Must be
4637/// 1, 2, 4, or 8.
4638/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Expands to __builtin_ia32_gatherq_q256(a, m, i, mask, s): a, i and mask are
   all cast to __v4di and m to long long const *. Every macro argument is
   parenthesized before it is cast. */
4639#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
4640 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
4641 (long long const *)(m), \
4642 (__v4di)(__m256i)(i), \
4643 (__v4di)(__m256i)(mask), (s)))
4644
4645/// Gathers two 64-bit floating-point values from memory \a m using scaled
4646/// indexes from the 128-bit vector of [4 x i32] in \a i.
4647///
4648/// \code{.operation}
4649/// FOR element := 0 to 1
4650/// j := element*64
4651/// k := element*32
4652/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4653/// ENDFOR
4654/// \endcode
4655///
4656/// \headerfile <immintrin.h>
4657///
4658/// \code
4659/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4660/// \endcode
4661///
4662/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4663///
4664/// \param m
4665/// A pointer to the memory used for loading values.
4666/// \param i
4667/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4668/// the first two elements are used.
4669/// \param s
4670/// A literal constant scale factor for the indexes in \a i. Must be
4671/// 1, 2, 4, or 8.
4672/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* This form has no mask parameter, but __builtin_ia32_gatherd_pd still takes
   a pass-through source and a mask. The source is _mm_undefined_pd() and the
   mask is _mm_cmpeq_pd(zero, zero); zero compares equal to zero in every
   element, so every mask bit is set and every element is loaded from memory. */
4673#define _mm_i32gather_pd(m, i, s) \
4674 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
4675 (double const *)(m), \
4676 (__v4si)(__m128i)(i), \
4677 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4678 _mm_setzero_pd()), \
4679 (s)))
4680
4681/// Gathers four 64-bit floating-point values from memory \a m using scaled
4682/// indexes from the 128-bit vector of [4 x i32] in \a i.
4683///
4684/// \code{.operation}
4685/// FOR element := 0 to 3
4686/// j := element*64
4687/// k := element*32
4688/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4689/// ENDFOR
4690/// \endcode
4691///
4692/// \headerfile <immintrin.h>
4693///
4694/// \code
4695/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4696/// \endcode
4697///
4698/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4699///
4700/// \param m
4701/// A pointer to the memory used for loading values.
4702/// \param i
4703/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4704/// \param s
4705/// A literal constant scale factor for the indexes in \a i. Must be
4706/// 1, 2, 4, or 8.
4707/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* This form has no mask parameter, but __builtin_ia32_gatherd_pd256 still
   takes a pass-through source and a mask. The source is _mm256_undefined_pd()
   and the mask is _mm256_cmp_pd(zero, zero, _CMP_EQ_OQ); zero compares equal
   to zero in every element, so every mask bit is set and every element is
   loaded from memory. */
4708#define _mm256_i32gather_pd(m, i, s) \
4709 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
4710 (double const *)(m), \
4711 (__v4si)(__m128i)(i), \
4712 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4713 _mm256_setzero_pd(), \
4714 _CMP_EQ_OQ), \
4715 (s)))
4716
4717/// Gathers two 64-bit floating-point values from memory \a m using scaled
4718/// indexes from the 128-bit vector of [2 x i64] in \a i.
4719///
4720/// \code{.operation}
4721/// FOR element := 0 to 1
4722/// j := element*64
4723/// k := element*64
4724/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4725/// ENDFOR
4726/// \endcode
4727///
4728/// \headerfile <immintrin.h>
4729///
4730/// \code
4731/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4732/// \endcode
4733///
4734/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4735///
4736/// \param m
4737/// A pointer to the memory used for loading values.
4738/// \param i
4739/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4740/// \param s
4741/// A literal constant scale factor for the indexes in \a i. Must be
4742/// 1, 2, 4, or 8.
4743/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* This form has no mask parameter, but __builtin_ia32_gatherq_pd still takes
   a pass-through source and a mask. The source is _mm_undefined_pd() and the
   mask is _mm_cmpeq_pd(zero, zero); zero compares equal to zero in every
   element, so every mask bit is set and every element is loaded from memory. */
4744#define _mm_i64gather_pd(m, i, s) \
4745 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
4746 (double const *)(m), \
4747 (__v2di)(__m128i)(i), \
4748 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4749 _mm_setzero_pd()), \
4750 (s)))
4751
4752/// Gathers four 64-bit floating-point values from memory \a m using scaled
4753/// indexes from the 256-bit vector of [4 x i64] in \a i.
4754///
4755/// \code{.operation}
4756/// FOR element := 0 to 3
4757/// j := element*64
4758/// k := element*64
4759/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4760/// ENDFOR
4761/// \endcode
4762///
4763/// \headerfile <immintrin.h>
4764///
4765/// \code
4766/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4767/// \endcode
4768///
4769/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4770///
4771/// \param m
4772/// A pointer to the memory used for loading values.
4773/// \param i
4774/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4775/// \param s
4776/// A literal constant scale factor for the indexes in \a i. Must be
4777/// 1, 2, 4, or 8.
4778/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* This form has no mask parameter, but __builtin_ia32_gatherq_pd256 still
   takes a pass-through source and a mask. The source is _mm256_undefined_pd()
   and the mask is _mm256_cmp_pd(zero, zero, _CMP_EQ_OQ); zero compares equal
   to zero in every element, so every mask bit is set and every element is
   loaded from memory. */
4779#define _mm256_i64gather_pd(m, i, s) \
4780 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4781 (double const *)(m), \
4782 (__v4di)(__m256i)(i), \
4783 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4784 _mm256_setzero_pd(), \
4785 _CMP_EQ_OQ), \
4786 (s)))
4787
4788/// Gathers four 32-bit floating-point values from memory \a m using scaled
4789/// indexes from the 128-bit vector of [4 x i32] in \a i.
4790///
4791/// \code{.operation}
4792/// FOR element := 0 to 3
4793/// j := element*32
4794/// k := element*32
4795/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4796/// ENDFOR
4797/// \endcode
4798///
4799/// \headerfile <immintrin.h>
4800///
4801/// \code
4802/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4803/// \endcode
4804///
4805/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4806///
4807/// \param m
4808/// A pointer to the memory used for loading values.
4809/// \param i
4810/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4811/// \param s
4812/// A literal constant scale factor for the indexes in \a i. Must be
4813/// 1, 2, 4, or 8.
4814/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* This form has no mask parameter, but __builtin_ia32_gatherd_ps still takes
   a pass-through source and a mask. The source is _mm_undefined_ps() and the
   mask is _mm_cmpeq_ps(zero, zero); zero compares equal to zero in every
   element, so every mask bit is set and every element is loaded from memory. */
4815#define _mm_i32gather_ps(m, i, s) \
4816 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4817 (float const *)(m), \
4818 (__v4si)(__m128i)(i), \
4819 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4820 _mm_setzero_ps()), \
4821 (s)))
4822
4823/// Gathers eight 32-bit floating-point values from memory \a m using scaled
4824/// indexes from the 256-bit vector of [8 x i32] in \a i.
4825///
4826/// \code{.operation}
4827/// FOR element := 0 to 7
4828/// j := element*32
4829/// k := element*32
4830/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4831/// ENDFOR
4832/// \endcode
4833///
4834/// \headerfile <immintrin.h>
4835///
4836/// \code
4837/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4838/// \endcode
4839///
4840/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4841///
4842/// \param m
4843/// A pointer to the memory used for loading values.
4844/// \param i
4845/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4846/// \param s
4847/// A literal constant scale factor for the indexes in \a i. Must be
4848/// 1, 2, 4, or 8.
4849/// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* This form has no mask parameter, but __builtin_ia32_gatherd_ps256 still
   takes a pass-through source and a mask. The source is _mm256_undefined_ps()
   and the mask is _mm256_cmp_ps(zero, zero, _CMP_EQ_OQ); zero compares equal
   to zero in every element, so every mask bit is set and every element is
   loaded from memory. */
4850#define _mm256_i32gather_ps(m, i, s) \
4851 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4852 (float const *)(m), \
4853 (__v8si)(__m256i)(i), \
4854 (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4855 _mm256_setzero_ps(), \
4856 _CMP_EQ_OQ), \
4857 (s)))
4858
4859/// Gathers two 32-bit floating-point values from memory \a m using scaled
4860/// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4861/// elements of the result are zeroed.
4862///
4863/// \code{.operation}
4864/// FOR element := 0 to 1
4865/// j := element*32
4866/// k := element*64
4867/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4868/// ENDFOR
4869/// result[127:64] := 0
4870/// \endcode
4871///
4872/// \headerfile <immintrin.h>
4873///
4874/// \code
4875/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4876/// \endcode
4877///
4878/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4879///
4880/// \param m
4881/// A pointer to the memory used for loading values.
4882/// \param i
4883/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4884/// \param s
4885/// A literal constant scale factor for the indexes in \a i. Must be
4886/// 1, 2, 4, or 8.
4887/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* This form has no mask parameter, but __builtin_ia32_gatherq_ps still takes
   a pass-through source and a mask. The source is _mm_undefined_ps() and the
   mask is _mm_cmpeq_ps(zero, zero); zero compares equal to zero in every
   element, so every mask bit is set and no element is taken from the
   undefined source. */
4888#define _mm_i64gather_ps(m, i, s) \
4889 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4890 (float const *)(m), \
4891 (__v2di)(__m128i)(i), \
4892 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4893 _mm_setzero_ps()), \
4894 (s)))
4895
4896/// Gathers four 32-bit floating-point values from memory \a m using scaled
4897/// indexes from the 256-bit vector of [4 x i64] in \a i.
4898///
4899/// \code{.operation}
4900/// FOR element := 0 to 3
4901/// j := element*32
4902/// k := element*64
4903/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4904/// ENDFOR
4905/// \endcode
4906///
4907/// \headerfile <immintrin.h>
4908///
4909/// \code
4910/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4911/// \endcode
4912///
4913/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4914///
4915/// \param m
4916/// A pointer to the memory used for loading values.
4917/// \param i
4918/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4919/// \param s
4920/// A literal constant scale factor for the indexes in \a i. Must be
4921/// 1, 2, 4, or 8.
4922/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4923#define _mm256_i64gather_ps(m, i, s) \
4924 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
4925 (float const *)(m), \
4926 (__v4di)(__m256i)(i), \
4927 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4928 _mm_setzero_ps()), \
4929 (s)))
4930
4931/// Gathers four 32-bit integer values from memory \a m using scaled
4932/// indexes from the 128-bit vector of [4 x i32] in \a i.
4933///
4934/// \code{.operation}
4935/// FOR element := 0 to 3
4936/// j := element*32
4937/// k := element*32
4938/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4939/// ENDFOR
4940/// \endcode
4941///
4942/// \headerfile <immintrin.h>
4943///
4944/// \code
4945/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
4946/// \endcode
4947///
4948/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4949///
4950/// \param m
4951/// A pointer to the memory used for loading values (cast to int const *).
4952/// \param i
4953/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4954/// \param s
4955/// A literal constant scale factor for the indexes in \a i. Must be
4956/// 1, 2, 4, or 8.
4957/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4958#define _mm_i32gather_epi32(m, i, s) \
4959 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
4960 (int const *)(m), (__v4si)(__m128i)(i), \
4961 (__v4si)_mm_set1_epi32(-1), (s)))
4962
4963/// Gathers eight 32-bit integer values from memory \a m using scaled
4964/// indexes from the 256-bit vector of [8 x i32] in \a i.
4965///
4966/// \code{.operation}
4967/// FOR element := 0 to 7
4968/// j := element*32
4969/// k := element*32
4970/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4971/// ENDFOR
4972/// \endcode
4973///
4974/// \headerfile <immintrin.h>
4975///
4976/// \code
4977/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
4978/// \endcode
4979///
4980/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4981///
4982/// \param m
4983/// A pointer to the memory used for loading values (cast to int const *).
4984/// \param i
4985/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4986/// \param s
4987/// A literal constant scale factor for the indexes in \a i. Must be
4988/// 1, 2, 4, or 8.
4989/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4990#define _mm256_i32gather_epi32(m, i, s) \
4991 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
4992 (int const *)(m), (__v8si)(__m256i)(i), \
4993 (__v8si)_mm256_set1_epi32(-1), (s)))
4994
4995/// Gathers two 32-bit integer values from memory \a m using scaled indexes
4996/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
4997/// of the result are zeroed.
4998///
4999/// \code{.operation}
5000/// FOR element := 0 to 1
5001/// j := element*32
5002/// k := element*64
5003/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5004/// ENDFOR
5005/// result[127:64] := 0
5006/// \endcode
5007///
5008/// \headerfile <immintrin.h>
5009///
5010/// \code
5011/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5012/// \endcode
5013///
5014/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5015///
5016/// \param m
5017/// A pointer to the memory used for loading values (cast to int const *).
5018/// \param i
5019/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5020/// \param s
5021/// A literal constant scale factor for the indexes in \a i. Must be
5022/// 1, 2, 4, or 8.
5023/// \returns A 128-bit vector of [4 x i32]: two gathered values, then two zeros.
5024#define _mm_i64gather_epi32(m, i, s) \
5025 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5026 (int const *)(m), (__v2di)(__m128i)(i), \
5027 (__v4si)_mm_set1_epi32(-1), (s)))
5028
5029/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5030/// from the 256-bit vector of [4 x i64] in \a i.
5031///
5032/// \code{.operation}
5033/// FOR element := 0 to 3
5034/// j := element*32
5035/// k := element*64
5036/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5037/// ENDFOR
5038/// \endcode
5039///
5040/// \headerfile <immintrin.h>
5041///
5042/// \code
5043/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5044/// \endcode
5045///
5046/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5047///
5048/// \param m
5049/// A pointer to the memory used for loading values (cast to int const *).
5050/// \param i
5051/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5052/// \param s
5053/// A literal constant scale factor for the indexes in \a i. Must be
5054/// 1, 2, 4, or 8.
5055/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5056#define _mm256_i64gather_epi32(m, i, s) \
5057 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5058 (int const *)(m), (__v4di)(__m256i)(i), \
5059 (__v4si)_mm_set1_epi32(-1), (s)))
5060
5061/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5062/// from the 128-bit vector of [4 x i32] in \a i.
5063///
5064/// \code{.operation}
5065/// FOR element := 0 to 1
5066/// j := element*64
5067/// k := element*32
5068/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5069/// ENDFOR
5070/// \endcode
5071///
5072/// \headerfile <immintrin.h>
5073///
5074/// \code
5075/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5076/// \endcode
5077///
5078/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5079///
5080/// \param m
5081/// A pointer to the memory used for loading values (cast to long long const *).
5082/// \param i
5083/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5084/// the first two elements are used.
5085/// \param s
5086/// A literal constant scale factor for the indexes in \a i. Must be
5087/// 1, 2, 4, or 8.
5088/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5089#define _mm_i32gather_epi64(m, i, s) \
5090 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5091 (long long const *)(m), \
5092 (__v4si)(__m128i)(i), \
5093 (__v2di)_mm_set1_epi64x(-1), (s)))
5094
5095/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5096/// from the 128-bit vector of [4 x i32] in \a i.
5097///
5098/// \code{.operation}
5099/// FOR element := 0 to 3
5100/// j := element*64
5101/// k := element*32
5102/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5103/// ENDFOR
5104/// \endcode
5105///
5106/// \headerfile <immintrin.h>
5107///
5108/// \code
5109/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5110/// \endcode
5111///
5112/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5113///
5114/// \param m
5115/// A pointer to the memory used for loading values (cast to long long const *).
5116/// \param i
5117/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5118/// \param s
5119/// A literal constant scale factor for the indexes in \a i. Must be
5120/// 1, 2, 4, or 8.
5121/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5122#define _mm256_i32gather_epi64(m, i, s) \
5123 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5124 (long long const *)(m), \
5125 (__v4si)(__m128i)(i), \
5126 (__v4di)_mm256_set1_epi64x(-1), (s)))
5127
5128/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5129/// from the 128-bit vector of [2 x i64] in \a i.
5130///
5131/// \code{.operation}
5132/// FOR element := 0 to 1
5133/// j := element*64
5134/// k := element*64
5135/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5136/// ENDFOR
5137/// \endcode
5138///
5139/// \headerfile <immintrin.h>
5140///
5141/// \code
5142/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5143/// \endcode
5144///
5145/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5146///
5147/// \param m
5148/// A pointer to the memory used for loading values (cast to long long const *).
5149/// \param i
5150/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5151/// \param s
5152/// A literal constant scale factor for the indexes in \a i. Must be
5153/// 1, 2, 4, or 8.
5154/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5155#define _mm_i64gather_epi64(m, i, s) \
5156 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5157 (long long const *)(m), \
5158 (__v2di)(__m128i)(i), \
5159 (__v2di)_mm_set1_epi64x(-1), (s)))
5160
5161/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5162/// from the 256-bit vector of [4 x i64] in \a i.
5163///
5164/// \code{.operation}
5165/// FOR element := 0 to 3
5166/// j := element*64
5167/// k := element*64
5168/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5169/// ENDFOR
5170/// \endcode
5171///
5172/// \headerfile <immintrin.h>
5173///
5174/// \code
5175/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5176/// \endcode
5177///
5178/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5179///
5180/// \param m
5181/// A pointer to the memory used for loading values (cast to long long const *).
5182/// \param i
5183/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5184/// \param s
5185/// A literal constant scale factor for the indexes in \a i. Must be
5186/// 1, 2, 4, or 8.
5187/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5188#define _mm256_i64gather_epi64(m, i, s) \
5189 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5190 (long long const *)(m), \
5191 (__v4di)(__m256i)(i), \
5192 (__v4di)_mm256_set1_epi64x(-1), (s)))
5193
5194#undef __DEFAULT_FN_ATTRS256 /* attribute helpers are local to this header */
5195#undef __DEFAULT_FN_ATTRS128
5196#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
5197#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5198
5199#endif /* __AVX2INTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
Definition altivec.h:578
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi32_epi64(__m128i __V)
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ int __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_movemask_epi8(__m256i __a)
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vecto...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(const void *__V)
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsubs_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than an...
Definition avx2intrin.h:722
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epu32(__m256i __a, __m256i __b)
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srav_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_andnot_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit int...
Definition avx2intrin.h:466
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation...
Definition avx2intrin.h:386
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maddubs_epi16(__m256i __a, __m256i __b)
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed by...
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x f...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M)
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to mem...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epu16(__m256i __a, __m256i __b)
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upp...
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastsd_pd(__m128d __a)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M)
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi16(__m128i __V)
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsi...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packus_epi32(__m256i __V1, __m256i __V2)
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation,...
Definition avx2intrin.h:261
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi32(__m256i __a)
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a a...
Definition avx2intrin.h:139
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srav_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi64(__m128i __V)
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and r...
Definition avx2intrin.h:670
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed sa...
Definition avx2intrin.h:368
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits spec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi32(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memor...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu32_epi64(__m128i __V)
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi32(__m128i __V)
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_shuffle_epi8(__m256i __a, __m256i __b)
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the ...
Definition avx2intrin.h:551
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and r...
Definition avx2intrin.h:696
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M)
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and r...
Definition avx2intrin.h:938
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greate...
Definition avx2intrin.h:750
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi32(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and re...
Definition avx2intrin.h:969
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 b...
Definition avx2intrin.h:279
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi32(__m128i __V)
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result'...
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi32(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and ret...
Definition avx2intrin.h:869
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater...
Definition avx2intrin.h:776
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi32(__m256i __a, __m256i __b)
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned satur...
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using sign...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns t...
Definition avx2intrin.h:618
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi16(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in _...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi8(__m256i __a)
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each va...
Definition avx2intrin.h:107
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi64(__m128i __V)
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi64(__m128i __V)
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadds_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using ...
Definition avx2intrin.h:903
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi64(__m256i __a, __m256i __b)
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the ...
Definition avx2intrin.h:333
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi64(__m128i __V)
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_and_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
Definition avx2intrin.h:448
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsi128_si256(__m128i __X)
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi32(__m256i __a, __m256i __b)
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the ...
Definition avx2intrin.h:315
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packus_epi16(__m256i __a, __m256i __b)
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation,...
Definition avx2intrin.h:230
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m256i __a)
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a ...
Definition avx2intrin.h:123
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the...
Definition avx2intrin.h:297
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_or_si256(__m256i __a, __m256i __b)
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsd_pd(__m128d __X)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi32(__m128i __V)
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and re...
Definition avx2intrin.h:838
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu16(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16...
Definition avx2intrin.h:517
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_xor_si256(__m256i __a, __m256i __b)
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi8(__m256i __a, __m256i __b)
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned ...
Definition avx2intrin.h:403
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M)
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the c...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi64(__m256i __a, __m256i __b)
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_madd_epi16(__m256i __a, __m256i __b)
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit interme...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 128-bit vector of [4 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 256-bit vector of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturat...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packs_epi32(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation,...
Definition avx2intrin.h:200
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater...
Definition avx2intrin.h:802
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b)
Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers from the 256-bit integer vectors __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation,...
Definition avx2intrin.h:351
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits, shifting in zero bits, and returns the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and ...
Definition avx2intrin.h:644
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packs_epi16(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation,...
Definition avx2intrin.h:169
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu8(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a a...
Definition avx2intrin.h:492
static __inline__ void int __a
Definition emmintrin.h:4077
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19