clang 22.0.0git
avx2intrin.h
Go to the documentation of this file.
1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
/* Refuse direct inclusion: compilation stops with the #error below unless
   __IMMINTRIN_H is already defined when this header is reached (presumably
   by <immintrin.h>, which is what the error text tells the user to include). */
10#ifndef __IMMINTRIN_H
11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX2INTRIN_H
15#define __AVX2INTRIN_H
16
17/* Define the default attributes for the functions in this file.
   Both macros request always-inline, no debug info and the "avx2" target;
   they differ only in the minimum vector width they declare (256 vs 128). */
18#define __DEFAULT_FN_ATTRS256 \
19 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
20 __min_vector_width__(256)))
21#define __DEFAULT_FN_ATTRS128 \
22 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
23 __min_vector_width__(128)))
24
/* The _CONSTEXPR variants append `constexpr` only when compiling as C++11 or
   later; in C, or in older C++, they expand to the plain attribute macros. */
25#if defined(__cplusplus) && (__cplusplus >= 201103L)
26#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
27#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
28#else
29#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
30#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
31#endif
32
33/* SSE4 Multiple Packed Sums of Absolute Difference. */
34/// Computes sixteen sum of absolute difference (SAD) operations on sets of
35/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
36/// \a Y.
37///
38/// Eight SAD results are computed using the lower half of the input
39/// vectors, and another eight using the upper half. These 16-bit values
40/// are returned in the lower and upper halves of the 256-bit result,
41/// respectively.
42///
43/// A single SAD operation selects four bytes from \a X and four bytes from
44/// \a Y as input. It computes the differences between each \a X byte and
45/// the corresponding \a Y byte, takes the absolute value of each
46/// difference, and sums these four values to form one 16-bit result. The
47/// intrinsic computes 16 of these results with different sets of input
48/// bytes.
49///
50/// For each set of eight results, the SAD operations use the same four
51/// bytes from \a Y; the starting bit position for these four bytes is
52/// specified by \a M[1:0] times 32. The eight operations use successive
53/// sets of four bytes from \a X; the starting bit position for the first
54/// set of four bytes is specified by \a M[2] times 32. These bit positions
55/// are all relative to the 128-bit lane for each set of eight operations.
56///
57/// \code{.operation}
58/// r := 0
59/// FOR i := 0 TO 1
60/// j := i*3
61/// Ybase := M[j+1:j]*32 + i*128
62/// Xbase := M[j+2]*32 + i*128
63/// FOR k := 0 TO 7
64/// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
65/// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
66/// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
67/// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
68/// result[r+15:r] := temp0 + temp1 + temp2 + temp3
69/// Xbase := Xbase + 8
70/// r := r + 16
71/// ENDFOR
72/// ENDFOR
73/// \endcode
74///
75/// \headerfile <immintrin.h>
76///
77/// \code
78/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
79/// \endcode
80///
81/// This intrinsic corresponds to the \c VMPSADBW instruction.
82///
83/// \param X
84/// A 256-bit integer vector containing one of the inputs.
85/// \param Y
86/// A 256-bit integer vector containing one of the inputs.
87/// \param M
88/// An unsigned immediate value specifying the starting positions of the
89/// bytes to operate on.
90/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro rather than function: M is documented as an immediate, and the macro
   forwards it textually (after an (int) cast) to the mpsadbw256 builtin.
   X and Y are each parenthesized, converted to __m256i and then viewed as
   __v32qi; the builtin's result is cast back to __m256i. */
91#define _mm256_mpsadbw_epu8(X, Y, M) \
92 ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
93 (__v32qi)(__m256i)(Y), (int)(M)))
94
95/// Computes the absolute value of each signed byte in the 256-bit integer
96/// vector \a __a and returns each value in the corresponding byte of
97/// the result.
98///
99/// \headerfile <immintrin.h>
100///
101/// This intrinsic corresponds to the \c VPABSB instruction.
102///
103/// \param __a
104/// A 256-bit integer vector.
105/// \returns A 256-bit integer vector containing the result.
106static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
108 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
109}
110
111/// Computes the absolute value of each signed 16-bit element in the 256-bit
112/// vector of [16 x i16] in \a __a and returns each value in the
113/// corresponding element of the result.
114///
115/// \headerfile <immintrin.h>
116///
117/// This intrinsic corresponds to the \c VPABSW instruction.
118///
119/// \param __a
120/// A 256-bit vector of [16 x i16].
121/// \returns A 256-bit vector of [16 x i16] containing the result.
122static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
124 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
125}
126
127/// Computes the absolute value of each signed 32-bit element in the 256-bit
128/// vector of [8 x i32] in \a __a and returns each value in the
129/// corresponding element of the result.
130///
131/// \headerfile <immintrin.h>
132///
133/// This intrinsic corresponds to the \c VPABSD instruction.
134///
135/// \param __a
136/// A 256-bit vector of [8 x i32].
137/// \returns A 256-bit vector of [8 x i32] containing the result.
138static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
140 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
141}
142
143/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
144/// integers using signed saturation, and returns the 256-bit result.
145///
146/// \code{.operation}
147/// FOR i := 0 TO 7
148/// j := i*16
149/// k := i*8
150/// result[7+k:k] := SATURATE8(__a[15+j:j])
151/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
152/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
153/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
154/// ENDFOR
155/// \endcode
156///
157/// \headerfile <immintrin.h>
158///
159/// This intrinsic corresponds to the \c VPACKSSWB instruction.
160///
161/// \param __a
162/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
163/// result[191:128].
164/// \param __b
165/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
166/// result[255:192].
167/// \returns A 256-bit integer vector containing the result.
168static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
169_mm256_packs_epi16(__m256i __a, __m256i __b) {
  /* Both inputs are viewed as [16 x i16] (__v16hi); the narrowing with signed
     saturation described above is done entirely by the packsswb256 builtin. */
170 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
171}
172
173/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
174/// integers using signed saturation, and returns the resulting 256-bit
175/// vector of [16 x i16].
176///
177/// \code{.operation}
178/// FOR i := 0 TO 3
179/// j := i*32
180/// k := i*16
181/// result[15+k:k] := SATURATE16(__a[31+j:j])
182/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
183/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
184/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
185/// ENDFOR
186/// \endcode
187///
188/// \headerfile <immintrin.h>
189///
190/// This intrinsic corresponds to the \c VPACKSSDW instruction.
191///
192/// \param __a
193/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
194/// result[191:128].
195/// \param __b
196/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
197/// result[255:192].
198/// \returns A 256-bit vector of [16 x i16] containing the result.
199static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
200_mm256_packs_epi32(__m256i __a, __m256i __b) {
  /* Both inputs are viewed as [8 x i32] (__v8si); the narrowing with signed
     saturation described above is done entirely by the packssdw256 builtin. */
201 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
202}
203
204/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
205/// using unsigned saturation, and returns the 256-bit result.
206///
207/// \code{.operation}
208/// FOR i := 0 TO 7
209/// j := i*16
210/// k := i*8
211/// result[7+k:k] := SATURATE8U(__a[15+j:j])
212/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
213/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
214/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
215/// ENDFOR
216/// \endcode
217///
218/// \headerfile <immintrin.h>
219///
220/// This intrinsic corresponds to the \c VPACKUSWB instruction.
221///
222/// \param __a
223/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
224/// result[191:128].
225/// \param __b
226/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
227/// result[255:192].
228/// \returns A 256-bit integer vector containing the result.
229static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
230_mm256_packus_epi16(__m256i __a, __m256i __b) {
  /* The inputs use the same __v16hi view as _mm256_packs_epi16; only the
     builtin differs (packuswb256), which performs the unsigned saturation. */
231 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
232}
233
234/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
235/// using unsigned saturation, and returns the resulting 256-bit vector of
236/// [16 x i16].
237///
238/// \code{.operation}
239/// FOR i := 0 TO 3
240/// j := i*32
241/// k := i*16
242/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
243/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
244/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
245/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
246/// ENDFOR
247/// \endcode
248///
249/// \headerfile <immintrin.h>
250///
251/// This intrinsic corresponds to the \c VPACKUSDW instruction.
252///
253/// \param __V1
254/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
255/// result[191:128].
256/// \param __V2
257/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
258/// result[255:192].
259/// \returns A 256-bit vector of [16 x i16] containing the result.
260static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
261_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
  /* The inputs use the same __v8si view as _mm256_packs_epi32; only the
     builtin differs (packusdw256), which performs the unsigned saturation. */
262 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
263}
264
265/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
266/// vectors and returns the lower 8 bits of each sum in the corresponding
267/// byte of the 256-bit integer vector result (overflow is ignored).
268///
269/// \headerfile <immintrin.h>
270///
271/// This intrinsic corresponds to the \c VPADDB instruction.
272///
273/// \param __a
274/// A 256-bit integer vector containing one of the source operands.
275/// \param __b
276/// A 256-bit integer vector containing one of the source operands.
277/// \returns A 256-bit integer vector containing the sums.
278static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
279_mm256_add_epi8(__m256i __a, __m256i __b) {
  /* Plain vector `+` on the __v32qu view of both operands (presumably the
     unsigned-byte vector type, declared outside this file), matching the
     documented behaviour that only the low 8 bits of each sum are kept. */
280 return (__m256i)((__v32qu)__a + (__v32qu)__b);
281}
282
283/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284/// [16 x i16] and returns the lower 16 bits of each sum in the
285/// corresponding element of the [16 x i16] result (overflow is ignored).
286///
287/// \headerfile <immintrin.h>
288///
289/// This intrinsic corresponds to the \c VPADDW instruction.
290///
291/// \param __a
292/// A 256-bit vector of [16 x i16] containing one of the source operands.
293/// \param __b
294/// A 256-bit vector of [16 x i16] containing one of the source operands.
295/// \returns A 256-bit vector of [16 x i16] containing the sums.
296static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
297_mm256_add_epi16(__m256i __a, __m256i __b) {
  /* Plain vector `+` on the __v16hu view of both operands (presumably the
     unsigned 16-bit vector type, declared outside this file), matching the
     documented behaviour that only the low 16 bits of each sum are kept. */
298 return (__m256i)((__v16hu)__a + (__v16hu)__b);
299}
300
301/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
302/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
303/// element of the [8 x i32] result (overflow is ignored).
304///
305/// \headerfile <immintrin.h>
306///
307/// This intrinsic corresponds to the \c VPADDD instruction.
308///
309/// \param __a
310/// A 256-bit vector of [8 x i32] containing one of the source operands.
311/// \param __b
312/// A 256-bit vector of [8 x i32] containing one of the source operands.
313/// \returns A 256-bit vector of [8 x i32] containing the sums.
314static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
315_mm256_add_epi32(__m256i __a, __m256i __b) {
  /* Plain vector `+` on the __v8su view of both operands (presumably the
     unsigned 32-bit vector type, declared outside this file), matching the
     documented behaviour that only the low 32 bits of each sum are kept. */
316 return (__m256i)((__v8su)__a + (__v8su)__b);
317}
318
319/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
320/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
321/// element of the [4 x i64] result (overflow is ignored).
322///
323/// \headerfile <immintrin.h>
324///
325/// This intrinsic corresponds to the \c VPADDQ instruction.
326///
327/// \param __a
328/// A 256-bit vector of [4 x i64] containing one of the source operands.
329/// \param __b
330/// A 256-bit vector of [4 x i64] containing one of the source operands.
331/// \returns A 256-bit vector of [4 x i64] containing the sums.
332static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
333_mm256_add_epi64(__m256i __a, __m256i __b) {
  /* Plain vector `+` on the __v4du view of both operands (presumably the
     unsigned 64-bit vector type, declared outside this file), matching the
     documented behaviour that only the low 64 bits of each sum are kept. */
334 return (__m256i)((__v4du)__a + (__v4du)__b);
335}
336
337/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
338/// vectors using signed saturation, and returns each sum in the
339/// corresponding byte of the 256-bit integer vector result.
340///
341/// \headerfile <immintrin.h>
342///
343/// This intrinsic corresponds to the \c VPADDSB instruction.
344///
345/// \param __a
346/// A 256-bit integer vector containing one of the source operands.
347/// \param __b
348/// A 256-bit integer vector containing one of the source operands.
349/// \returns A 256-bit integer vector containing the sums.
350static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
351_mm256_adds_epi8(__m256i __a, __m256i __b) {
  /* The operand view selects the flavour of saturation: __v32qs here for the
     signed form, whereas _mm256_adds_epu8 passes __v32qu to the same builtin. */
352 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
353}
354
355/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
356/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
357///
358/// \headerfile <immintrin.h>
359///
360/// This intrinsic corresponds to the \c VPADDSW instruction.
361///
362/// \param __a
363/// A 256-bit vector of [16 x i16] containing one of the source operands.
364/// \param __b
365/// A 256-bit vector of [16 x i16] containing one of the source operands.
366/// \returns A 256-bit vector of [16 x i16] containing the sums.
367static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
368_mm256_adds_epi16(__m256i __a, __m256i __b) {
  /* The operand view selects the flavour of saturation: __v16hi here for the
     signed form, whereas _mm256_adds_epu16 passes __v16hu to the same
     builtin. */
369 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
370}
371
372/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
373/// vectors using unsigned saturation, and returns each sum in the
374/// corresponding byte of the 256-bit integer vector result.
375///
376/// \headerfile <immintrin.h>
377///
378/// This intrinsic corresponds to the \c VPADDUSB instruction.
379///
380/// \param __a
381/// A 256-bit integer vector containing one of the source operands.
382/// \param __b
383/// A 256-bit integer vector containing one of the source operands.
384/// \returns A 256-bit integer vector containing the sums.
385static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
386_mm256_adds_epu8(__m256i __a, __m256i __b) {
  /* Same builtin as _mm256_adds_epi8; passing the __v32qu view instead of
     __v32qs is what makes this the unsigned-saturation variant. */
387 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
388}
389
390/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
391/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
392///
393/// \headerfile <immintrin.h>
394///
395/// This intrinsic corresponds to the \c VPADDUSW instruction.
396///
397/// \param __a
398/// A 256-bit vector of [16 x i16] containing one of the source operands.
399/// \param __b
400/// A 256-bit vector of [16 x i16] containing one of the source operands.
401/// \returns A 256-bit vector of [16 x i16] containing the sums.
402static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
403_mm256_adds_epu16(__m256i __a, __m256i __b) {
  /* Same builtin as _mm256_adds_epi16; passing the __v16hu view instead of
     __v16hi is what makes this the unsigned-saturation variant. */
404 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
405}
406
407/// Uses the lower half of the 256-bit vector \a a as the upper half of a
408/// temporary 256-bit value, and the lower half of the 256-bit vector \a b
409/// as the lower half of the temporary value. Right-shifts the temporary
410/// value by \a n bytes, and uses the lower 16 bytes of the shifted value
411/// as the lower 16 bytes of the result. Uses the upper halves of \a a and
412/// \a b to make another temporary value, right shifts by \a n, and uses
413/// the lower 16 bytes of the shifted value as the upper 16 bytes of the
414/// result.
415///
416/// \headerfile <immintrin.h>
417///
418/// \code
419/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
420/// \endcode
421///
422/// This intrinsic corresponds to the \c VPALIGNR instruction.
423///
424/// \param a
425/// A 256-bit integer vector containing source values.
426/// \param b
427/// A 256-bit integer vector containing source values.
428/// \param n
429/// An immediate value specifying the number of bytes to shift.
430/// \returns A 256-bit integer vector containing the result.
/* Macro rather than function: n is documented as an immediate and is
   forwarded textually to the palignr256 builtin. Unlike the other
   immediate-taking macros in this file (_mm256_mpsadbw_epu8,
   _mm256_blend_epi16), n is passed as (n) without an (int) cast.
   a and b are converted to __m256i and then viewed as __v32qi. */
431#define _mm256_alignr_epi8(a, b, n) \
432 ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
433 (__v32qi)(__m256i)(b), (n)))
434
435/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
436/// \a __b.
437///
438/// \headerfile <immintrin.h>
439///
440/// This intrinsic corresponds to the \c VPAND instruction.
441///
442/// \param __a
443/// A 256-bit integer vector.
444/// \param __b
445/// A 256-bit integer vector.
446/// \returns A 256-bit integer vector containing the result.
447static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
448_mm256_and_si256(__m256i __a, __m256i __b)
449{
  /* The AND is written with the vector `&` operator on the __v4du view of
     both operands; the result is cast back to __m256i. */
450 return (__m256i)((__v4du)__a & (__v4du)__b);
451}
452
453/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
454/// the bitwise NOT of the 256-bit integer vector in \a __a.
455///
456/// \headerfile <immintrin.h>
457///
458/// This intrinsic corresponds to the \c VPANDN instruction.
459///
460/// \param __a
461/// A 256-bit integer vector.
462/// \param __b
463/// A 256-bit integer vector.
464/// \returns A 256-bit integer vector containing the result.
465static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
466_mm256_andnot_si256(__m256i __a, __m256i __b)
467{
  /* Operand order matters: __a (the FIRST argument) is the one that is
     complemented, and the complement is then ANDed with __b. */
468 return (__m256i)(~(__v4du)__a & (__v4du)__b);
469}
470
471/// Computes the averages of the corresponding unsigned bytes in the two
472/// 256-bit integer vectors in \a __a and \a __b and returns each
473/// average in the corresponding byte of the 256-bit result.
474///
475/// \code{.operation}
476/// FOR i := 0 TO 31
477/// j := i*8
478/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
479/// ENDFOR
480/// \endcode
481///
482/// \headerfile <immintrin.h>
483///
484/// This intrinsic corresponds to the \c VPAVGB instruction.
485///
486/// \param __a
487/// A 256-bit integer vector.
488/// \param __b
489/// A 256-bit integer vector.
490/// \returns A 256-bit integer vector containing the result.
491static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
492_mm256_avg_epu8(__m256i __a, __m256i __b) {
  /* Operands are passed as __v32qu to the pavgb256 builtin, which produces
     the (a + b + 1) >> 1 average given in the pseudo-code above. */
493 return (__m256i)__builtin_ia32_pavgb256((__v32qu)__a, (__v32qu)__b);
494}
495
496/// Computes the averages of the corresponding unsigned 16-bit integers in
497/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
498/// each average in the corresponding element of the 256-bit result.
499///
500/// \code{.operation}
501/// FOR i := 0 TO 15
502/// j := i*16
503/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
504/// ENDFOR
505/// \endcode
506///
507/// \headerfile <immintrin.h>
508///
509/// This intrinsic corresponds to the \c VPAVGW instruction.
510///
511/// \param __a
512/// A 256-bit vector of [16 x i16].
513/// \param __b
514/// A 256-bit vector of [16 x i16].
515/// \returns A 256-bit vector of [16 x i16] containing the result.
516static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
517_mm256_avg_epu16(__m256i __a, __m256i __b) {
  /* Operands are passed as __v16hu to the pavgw256 builtin, which produces
     the (a + b + 1) >> 1 average given in the pseudo-code above. */
518 return (__m256i)__builtin_ia32_pavgw256((__v16hu)__a, (__v16hu)__b);
519}
520
521/// Merges 8-bit integer values from either of the two 256-bit vectors
522/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
523/// the resulting 256-bit integer vector.
524///
525/// \code{.operation}
526/// FOR i := 0 TO 31
527/// j := i*8
528/// IF __M[7+j] == 0
529/// result[7+j:j] := __V1[7+j:j]
530/// ELSE
531/// result[7+j:j] := __V2[7+j:j]
532/// FI
533/// ENDFOR
534/// \endcode
535///
536/// \headerfile <immintrin.h>
537///
538/// This intrinsic corresponds to the \c VPBLENDVB instruction.
539///
540/// \param __V1
541/// A 256-bit integer vector containing source values.
542/// \param __V2
543/// A 256-bit integer vector containing source values.
544/// \param __M
545/// A 256-bit integer vector, with bit [7] of each byte specifying the
546/// source for each corresponding byte of the result. When the mask bit
547/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
548/// \a __V2.
549/// \returns A 256-bit integer vector containing the result.
550static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
551_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
  /* All three operands, including the mask __M, are viewed as __v32qi and
     handed to the pblendvb256 builtin in the order (V1, V2, mask). */
552 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
553 (__v32qi)__M);
554}
555
556/// Merges 16-bit integer values from either of the two 256-bit vectors
557/// \a V1 or \a V2, as specified by the immediate integer operand \a M,
558/// and returns the resulting 256-bit vector of [16 x i16].
559///
560/// \code{.operation}
561/// FOR i := 0 TO 7
562/// j := i*16
563/// IF M[i] == 0
564/// result[15+j:j] := V1[15+j:j]
565/// result[143+j:128+j] := V1[143+j:128+j]
566/// ELSE
567/// result[15+j:j] := V2[15+j:j]
568/// result[143+j:128+j] := V2[143+j:128+j]
569/// FI
570/// ENDFOR
571/// \endcode
572///
573/// \headerfile <immintrin.h>
574///
575/// \code
576/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
577/// \endcode
578///
579/// This intrinsic corresponds to the \c VPBLENDW instruction.
580///
581/// \param V1
582/// A 256-bit vector of [16 x i16] containing source values.
583/// \param V2
584/// A 256-bit vector of [16 x i16] containing source values.
585/// \param M
586/// An immediate 8-bit integer operand, with bits [7:0] specifying the
587/// source for each element of the result. The position of the mask bit
588/// corresponds to the index of a copied value. When a mask bit is 0, the
589/// element is copied from \a V1; otherwise, it is copied from \a V2.
590/// \a M[0] determines the source for elements 0 and 8, \a M[1] for
591/// elements 1 and 9, and so forth.
592/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro rather than function: M is documented as an immediate, and the macro
   forwards it textually (after an (int) cast) to the pblendw256 builtin.
   V1 and V2 are converted to __m256i and then viewed as __v16hi. */
593#define _mm256_blend_epi16(V1, V2, M) \
594 ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
595 (__v16hi)(__m256i)(V2), (int)(M)))
596
597/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
598/// \a __b for equality and returns the outcomes in the corresponding
599/// bytes of the 256-bit result.
600///
601/// \code{.operation}
602/// FOR i := 0 TO 31
603/// j := i*8
604/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
605/// ENDFOR
606/// \endcode
607///
608/// \headerfile <immintrin.h>
609///
610/// This intrinsic corresponds to the \c VPCMPEQB instruction.
611///
612/// \param __a
613/// A 256-bit integer vector containing one of the inputs.
614/// \param __b
615/// A 256-bit integer vector containing one of the inputs.
616/// \returns A 256-bit integer vector containing the result.
617static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
618_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
619{
  /* Element-wise `==` on the __v32qi (byte) view; the vector comparison
     result is cast straight back to __m256i. */
620 return (__m256i)((__v32qi)__a == (__v32qi)__b);
621}
622
623/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
624/// \a __a and \a __b for equality and returns the outcomes in the
625/// corresponding elements of the 256-bit result.
626///
627/// \code{.operation}
628/// FOR i := 0 TO 15
629/// j := i*16
630/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
631/// ENDFOR
632/// \endcode
633///
634/// \headerfile <immintrin.h>
635///
636/// This intrinsic corresponds to the \c VPCMPEQW instruction.
637///
638/// \param __a
639/// A 256-bit vector of [16 x i16] containing one of the inputs.
640/// \param __b
641/// A 256-bit vector of [16 x i16] containing one of the inputs.
642/// \returns A 256-bit vector of [16 x i16] containing the result.
643static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
644_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
645{
  /* Element-wise `==` on the __v16hi (16-bit) view; the vector comparison
     result is cast straight back to __m256i. */
646 return (__m256i)((__v16hi)__a == (__v16hi)__b);
647}
648
649/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
650/// \a __a and \a __b for equality and returns the outcomes in the
651/// corresponding elements of the 256-bit result.
652///
653/// \code{.operation}
654/// FOR i := 0 TO 7
655/// j := i*32
656/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
657/// ENDFOR
658/// \endcode
659///
660/// \headerfile <immintrin.h>
661///
662/// This intrinsic corresponds to the \c VPCMPEQD instruction.
663///
664/// \param __a
665/// A 256-bit vector of [8 x i32] containing one of the inputs.
666/// \param __b
667/// A 256-bit vector of [8 x i32] containing one of the inputs.
668/// \returns A 256-bit vector of [8 x i32] containing the result.
669static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
670_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
671{
  /* Element-wise `==` on the __v8si (32-bit) view; the vector comparison
     result is cast straight back to __m256i. */
672 return (__m256i)((__v8si)__a == (__v8si)__b);
673}
674
675/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
676/// \a __a and \a __b for equality and returns the outcomes in the
677/// corresponding elements of the 256-bit result.
678///
679/// \code{.operation}
680/// FOR i := 0 TO 3
681/// j := i*64
682/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
683/// ENDFOR
684/// \endcode
685///
686/// \headerfile <immintrin.h>
687///
688/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
689///
690/// \param __a
691/// A 256-bit vector of [4 x i64] containing one of the inputs.
692/// \param __b
693/// A 256-bit vector of [4 x i64] containing one of the inputs.
694/// \returns A 256-bit vector of [4 x i64] containing the result.
695static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
696_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
697{
  /* Element-wise `==` on the __v4di (64-bit) view; the vector comparison
     result is cast straight back to __m256i. */
698 return (__m256i)((__v4di)__a == (__v4di)__b);
699}
700
701/// Compares corresponding signed bytes in the 256-bit integer vectors in
702/// \a __a and \a __b for greater-than and returns the outcomes in the
703/// corresponding bytes of the 256-bit result.
704///
705/// \code{.operation}
706/// FOR i := 0 TO 31
707/// j := i*8
708/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
709/// ENDFOR
710/// \endcode
711///
712/// \headerfile <immintrin.h>
713///
714/// This intrinsic corresponds to the \c VPCMPGTB instruction.
715///
716/// \param __a
717/// A 256-bit integer vector containing one of the inputs.
718/// \param __b
719/// A 256-bit integer vector containing one of the inputs.
720/// \returns A 256-bit integer vector containing the result.
721static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
722_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
723{
724 /* This function always performs a signed comparison, but __v32qi is a
     vector of plain char, which may be signed or unsigned. Both operands are
     therefore viewed as __v32qs so that `>` compares them as signed bytes. */
726 return (__m256i)((__v32qs)__a > (__v32qs)__b);
727}
728
729/// Compares corresponding signed elements in the 256-bit vectors of
730/// [16 x i16] in \a __a and \a __b for greater-than and returns the
731/// outcomes in the corresponding elements of the 256-bit result.
732///
733/// \code{.operation}
734/// FOR i := 0 TO 15
735/// j := i*16
736/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
737/// ENDFOR
738/// \endcode
739///
740/// \headerfile <immintrin.h>
741///
742/// This intrinsic corresponds to the \c VPCMPGTW instruction.
743///
744/// \param __a
745/// A 256-bit vector of [16 x i16] containing one of the inputs.
746/// \param __b
747/// A 256-bit vector of [16 x i16] containing one of the inputs.
748/// \returns A 256-bit vector of [16 x i16] containing the result.
749static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
750_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
751{
  /* Element-wise `>` on the __v16hi view (presumably signed 16-bit elements,
     consistent with the signed comparison documented above); the comparison
     result is cast straight back to __m256i. */
752 return (__m256i)((__v16hi)__a > (__v16hi)__b);
753}
754
755/// Compares corresponding signed elements in the 256-bit vectors of
756/// [8 x i32] in \a __a and \a __b for greater-than and returns the
757/// outcomes in the corresponding elements of the 256-bit result.
758///
759/// \code{.operation}
760/// FOR i := 0 TO 7
761/// j := i*32
762/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
763/// ENDFOR
764/// \endcode
765///
766/// \headerfile <immintrin.h>
767///
768/// This intrinsic corresponds to the \c VPCMPGTD instruction.
769///
770/// \param __a
771/// A 256-bit vector of [8 x i32] containing one of the inputs.
772/// \param __b
773/// A 256-bit vector of [8 x i32] containing one of the inputs.
774/// \returns A 256-bit vector of [8 x i32] containing the result.
775static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
776_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
777{
  /* Element-wise `>` on the __v8si view (presumably signed 32-bit elements,
     consistent with the signed comparison documented above); the comparison
     result is cast straight back to __m256i. */
778 return (__m256i)((__v8si)__a > (__v8si)__b);
779}
780
781/// Compares corresponding signed elements in the 256-bit vectors of
782/// [4 x i64] in \a __a and \a __b for greater-than and returns the
783/// outcomes in the corresponding elements of the 256-bit result.
784///
785/// \code{.operation}
786/// FOR i := 0 TO 3
787/// j := i*64
788/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
789/// ENDFOR
790/// \endcode
791///
792/// \headerfile <immintrin.h>
793///
794/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
795///
796/// \param __a
797/// A 256-bit vector of [4 x i64] containing one of the inputs.
798/// \param __b
799/// A 256-bit vector of [4 x i64] containing one of the inputs.
800/// \returns A 256-bit vector of [4 x i64] containing the result.
801static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
802_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
803{
  /* Element-wise `>` on the __v4di view (presumably signed 64-bit elements,
     consistent with the signed comparison documented above); the comparison
     result is cast straight back to __m256i. */
804 return (__m256i)((__v4di)__a > (__v4di)__b);
805}
806
807/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
808/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
809/// element of the [16 x i16] result (overflow is ignored). Sums from
810/// \a __a are returned in the lower 64 bits of each 128-bit half of the
811/// result; sums from \a __b are returned in the upper 64 bits of each
812/// 128-bit half of the result.
813///
814/// \code{.operation}
815/// FOR i := 0 TO 1
816/// j := i*128
817/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
818/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
819/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
820/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
821/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
822/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
823/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
824/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
825/// ENDFOR
826/// \endcode
827///
828/// \headerfile <immintrin.h>
829///
830/// This intrinsic corresponds to the \c VPHADDW instruction.
831///
832/// \param __a
833/// A 256-bit vector of [16 x i16] containing one of the source operands.
834/// \param __b
835/// A 256-bit vector of [16 x i16] containing one of the source operands.
836/// \returns A 256-bit vector of [16 x i16] containing the sums.
837static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
838_mm256_hadd_epi16(__m256i __a, __m256i __b) {
  /* This wrapper only casts both operands to __v16hi; the pairwise sums and
     the per-128-bit-half result layout described above come from the
     phaddw256 builtin. */
839 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
840}
841
842/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
843/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
844/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
845/// are returned in the lower 64 bits of each 128-bit half of the result;
846/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
847/// of the result.
848///
849/// \code{.operation}
850/// FOR i := 0 TO 1
851/// j := i*128
852/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
853/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
854/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
855/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
856/// ENDFOR
857/// \endcode
858///
859/// \headerfile <immintrin.h>
860///
861/// This intrinsic corresponds to the \c VPHADDD instruction.
862///
863/// \param __a
864/// A 256-bit vector of [8 x i32] containing one of the source operands.
865/// \param __b
866/// A 256-bit vector of [8 x i32] containing one of the source operands.
867/// \returns A 256-bit vector of [8 x i32] containing the sums.
868static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
869_mm256_hadd_epi32(__m256i __a, __m256i __b) {
870 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
871}
872
873/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
874/// vectors of [16 x i16] using signed saturation and returns each sum in
875/// an element of the [16 x i16] result. Sums from \a __a are returned in
876/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
877/// are returned in the upper 64 bits of each 128-bit half of the result.
878///
879/// \code{.operation}
880/// FOR i := 0 TO 1
881/// j := i*128
882/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
883/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
884/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
885/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
886/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
887/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
888/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
889/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
890/// ENDFOR
891/// \endcode
892///
893/// \headerfile <immintrin.h>
894///
895/// This intrinsic corresponds to the \c VPHADDSW instruction.
896///
897/// \param __a
898/// A 256-bit vector of [16 x i16] containing one of the source operands.
899/// \param __b
900/// A 256-bit vector of [16 x i16] containing one of the source operands.
901/// \returns A 256-bit vector of [16 x i16] containing the sums.
902static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
903_mm256_hadds_epi16(__m256i __a, __m256i __b) {
904 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
905}
906
907/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
908/// vectors of [16 x i16] and returns the lower 16 bits of each difference
909/// in an element of the [16 x i16] result (overflow is ignored).
910/// Differences from \a __a are returned in the lower 64 bits of each
911/// 128-bit half of the result; differences from \a __b are returned in the
912/// upper 64 bits of each 128-bit half of the result.
913///
914/// \code{.operation}
915/// FOR i := 0 TO 1
916/// j := i*128
917/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
918/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
919/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
920/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
921/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
922/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
923/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
924/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
925/// ENDFOR
926/// \endcode
927///
928/// \headerfile <immintrin.h>
929///
930/// This intrinsic corresponds to the \c VPHSUBW instruction.
931///
932/// \param __a
933/// A 256-bit vector of [16 x i16] containing one of the source operands.
934/// \param __b
935/// A 256-bit vector of [16 x i16] containing one of the source operands.
936/// \returns A 256-bit vector of [16 x i16] containing the differences.
937static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
938_mm256_hsub_epi16(__m256i __a, __m256i __b) {
939 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
940}
941
942/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
943/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
944/// an element of the [8 x i32] result (overflow is ignored). Differences
945/// from \a __a are returned in the lower 64 bits of each 128-bit half of
946/// the result; differences from \a __b are returned in the upper 64 bits
947/// of each 128-bit half of the result.
948///
949/// \code{.operation}
950/// FOR i := 0 TO 1
951/// j := i*128
952/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
953/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
954/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
955/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
956/// ENDFOR
957/// \endcode
958///
959/// \headerfile <immintrin.h>
960///
961/// This intrinsic corresponds to the \c VPHSUBD instruction.
962///
963/// \param __a
964/// A 256-bit vector of [8 x i32] containing one of the source operands.
965/// \param __b
966/// A 256-bit vector of [8 x i32] containing one of the source operands.
967/// \returns A 256-bit vector of [8 x i32] containing the differences.
968static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
969_mm256_hsub_epi32(__m256i __a, __m256i __b) {
970 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
971}
972
973/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
/// vectors of [16 x i16] using signed saturation and returns each difference in
975/// an element of the [16 x i16] result. Differences from \a __a are
976/// returned in the lower 64 bits of each 128-bit half of the result;
977/// differences from \a __b are returned in the upper 64 bits of each
978/// 128-bit half of the result.
979///
980/// \code{.operation}
981/// FOR i := 0 TO 1
982/// j := i*128
983/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
984/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
985/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
986/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
987/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
988/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
989/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
990/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
991/// ENDFOR
992/// \endcode
993///
994/// \headerfile <immintrin.h>
995///
996/// This intrinsic corresponds to the \c VPHSUBSW instruction.
997///
998/// \param __a
999/// A 256-bit vector of [16 x i16] containing one of the source operands.
1000/// \param __b
1001/// A 256-bit vector of [16 x i16] containing one of the source operands.
1002/// \returns A 256-bit vector of [16 x i16] containing the differences.
1003static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1004_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
1005 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1006}
1007
1008/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1009/// with the corresponding signed byte from the 256-bit integer vector in
1010/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1011/// pairs of those products using signed saturation to form 16-bit sums
1012/// returned as elements of the [16 x i16] result.
1013///
1014/// \code{.operation}
1015/// FOR i := 0 TO 15
1016/// j := i*16
1017/// temp1 := __a[j+7:j] * __b[j+7:j]
1018/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1019/// result[j+15:j] := SATURATE16(temp1 + temp2)
1020/// ENDFOR
1021/// \endcode
1022///
1023/// \headerfile <immintrin.h>
1024///
1025/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1026///
1027/// \param __a
1028/// A 256-bit vector containing one of the source operands.
1029/// \param __b
1030/// A 256-bit vector containing one of the source operands.
1031/// \returns A 256-bit vector of [16 x i16] containing the result.
1032static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1033_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
1034 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1035}
1036
1037/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1038/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1039/// those products to form 32-bit sums returned as elements of the
1040/// [8 x i32] result.
1041///
1042/// There is only one wraparound case: when all four of the 16-bit sources
1043/// are \c 0x8000, the result will be \c 0x80000000.
1044///
1045/// \code{.operation}
1046/// FOR i := 0 TO 7
1047/// j := i*32
1048/// temp1 := __a[j+15:j] * __b[j+15:j]
1049/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1050/// result[j+31:j] := temp1 + temp2
1051/// ENDFOR
1052/// \endcode
1053///
1054/// \headerfile <immintrin.h>
1055///
1056/// This intrinsic corresponds to the \c VPMADDWD instruction.
1057///
1058/// \param __a
1059/// A 256-bit vector of [16 x i16] containing one of the source operands.
1060/// \param __b
1061/// A 256-bit vector of [16 x i16] containing one of the source operands.
1062/// \returns A 256-bit vector of [8 x i32] containing the result.
1063static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1064_mm256_madd_epi16(__m256i __a, __m256i __b) {
1065 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1066}
1067
1068/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1069/// in \a __a and \a __b and returns the larger of each pair in the
1070/// corresponding byte of the 256-bit result.
1071///
1072/// \headerfile <immintrin.h>
1073///
1074/// This intrinsic corresponds to the \c VPMAXSB instruction.
1075///
1076/// \param __a
1077/// A 256-bit integer vector.
1078/// \param __b
1079/// A 256-bit integer vector.
1080/// \returns A 256-bit integer vector containing the result.
1081static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1082_mm256_max_epi8(__m256i __a, __m256i __b) {
1083 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1084}
1085
1086/// Compares the corresponding signed 16-bit integers in the two 256-bit
1087/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1088/// each pair in the corresponding element of the 256-bit result.
1089///
1090/// \headerfile <immintrin.h>
1091///
1092/// This intrinsic corresponds to the \c VPMAXSW instruction.
1093///
1094/// \param __a
1095/// A 256-bit vector of [16 x i16].
1096/// \param __b
1097/// A 256-bit vector of [16 x i16].
1098/// \returns A 256-bit vector of [16 x i16] containing the result.
1099static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1100_mm256_max_epi16(__m256i __a, __m256i __b) {
1101 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1102}
1103
1104/// Compares the corresponding signed 32-bit integers in the two 256-bit
1105/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1106/// each pair in the corresponding element of the 256-bit result.
1107///
1108/// \headerfile <immintrin.h>
1109///
1110/// This intrinsic corresponds to the \c VPMAXSD instruction.
1111///
1112/// \param __a
1113/// A 256-bit vector of [8 x i32].
1114/// \param __b
1115/// A 256-bit vector of [8 x i32].
1116/// \returns A 256-bit vector of [8 x i32] containing the result.
1117static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1118_mm256_max_epi32(__m256i __a, __m256i __b) {
1119 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1120}
1121
1122/// Compares the corresponding unsigned bytes in the two 256-bit integer
1123/// vectors in \a __a and \a __b and returns the larger of each pair in
1124/// the corresponding byte of the 256-bit result.
1125///
1126/// \headerfile <immintrin.h>
1127///
1128/// This intrinsic corresponds to the \c VPMAXUB instruction.
1129///
1130/// \param __a
1131/// A 256-bit integer vector.
1132/// \param __b
1133/// A 256-bit integer vector.
1134/// \returns A 256-bit integer vector containing the result.
1135static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1136_mm256_max_epu8(__m256i __a, __m256i __b) {
1137 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1138}
1139
1140/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1141/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1142/// each pair in the corresponding element of the 256-bit result.
1143///
1144/// \headerfile <immintrin.h>
1145///
1146/// This intrinsic corresponds to the \c VPMAXUW instruction.
1147///
1148/// \param __a
1149/// A 256-bit vector of [16 x i16].
1150/// \param __b
1151/// A 256-bit vector of [16 x i16].
1152/// \returns A 256-bit vector of [16 x i16] containing the result.
1153static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1154_mm256_max_epu16(__m256i __a, __m256i __b) {
1155 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1156}
1157
1158/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1159/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1160/// each pair in the corresponding element of the 256-bit result.
1161///
1162/// \headerfile <immintrin.h>
1163///
1164/// This intrinsic corresponds to the \c VPMAXUD instruction.
1165///
1166/// \param __a
1167/// A 256-bit vector of [8 x i32].
1168/// \param __b
1169/// A 256-bit vector of [8 x i32].
1170/// \returns A 256-bit vector of [8 x i32] containing the result.
1171static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1172_mm256_max_epu32(__m256i __a, __m256i __b) {
1173 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1174}
1175
1176/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1177/// in \a __a and \a __b and returns the smaller of each pair in the
1178/// corresponding byte of the 256-bit result.
1179///
1180/// \headerfile <immintrin.h>
1181///
1182/// This intrinsic corresponds to the \c VPMINSB instruction.
1183///
1184/// \param __a
1185/// A 256-bit integer vector.
1186/// \param __b
1187/// A 256-bit integer vector.
1188/// \returns A 256-bit integer vector containing the result.
1189static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1190_mm256_min_epi8(__m256i __a, __m256i __b) {
1191 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1192}
1193
1194/// Compares the corresponding signed 16-bit integers in the two 256-bit
1195/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1196/// each pair in the corresponding element of the 256-bit result.
1197///
1198/// \headerfile <immintrin.h>
1199///
1200/// This intrinsic corresponds to the \c VPMINSW instruction.
1201///
1202/// \param __a
1203/// A 256-bit vector of [16 x i16].
1204/// \param __b
1205/// A 256-bit vector of [16 x i16].
1206/// \returns A 256-bit vector of [16 x i16] containing the result.
1207static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1208_mm256_min_epi16(__m256i __a, __m256i __b) {
1209 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1210}
1211
1212/// Compares the corresponding signed 32-bit integers in the two 256-bit
1213/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1214/// each pair in the corresponding element of the 256-bit result.
1215///
1216/// \headerfile <immintrin.h>
1217///
1218/// This intrinsic corresponds to the \c VPMINSD instruction.
1219///
1220/// \param __a
1221/// A 256-bit vector of [8 x i32].
1222/// \param __b
1223/// A 256-bit vector of [8 x i32].
1224/// \returns A 256-bit vector of [8 x i32] containing the result.
1225static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1226_mm256_min_epi32(__m256i __a, __m256i __b) {
1227 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1228}
1229
1230/// Compares the corresponding unsigned bytes in the two 256-bit integer
1231/// vectors in \a __a and \a __b and returns the smaller of each pair in
1232/// the corresponding byte of the 256-bit result.
1233///
1234/// \headerfile <immintrin.h>
1235///
1236/// This intrinsic corresponds to the \c VPMINUB instruction.
1237///
1238/// \param __a
1239/// A 256-bit integer vector.
1240/// \param __b
1241/// A 256-bit integer vector.
1242/// \returns A 256-bit integer vector containing the result.
1243static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1244_mm256_min_epu8(__m256i __a, __m256i __b) {
1245 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1246}
1247
1248/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1249/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1250/// each pair in the corresponding element of the 256-bit result.
1251///
1252/// \headerfile <immintrin.h>
1253///
1254/// This intrinsic corresponds to the \c VPMINUW instruction.
1255///
1256/// \param __a
1257/// A 256-bit vector of [16 x i16].
1258/// \param __b
1259/// A 256-bit vector of [16 x i16].
1260/// \returns A 256-bit vector of [16 x i16] containing the result.
1261static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1262_mm256_min_epu16(__m256i __a, __m256i __b) {
1263 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1264}
1265
1266/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1267/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1268/// each pair in the corresponding element of the 256-bit result.
1269///
1270/// \headerfile <immintrin.h>
1271///
1272/// This intrinsic corresponds to the \c VPMINUD instruction.
1273///
1274/// \param __a
1275/// A 256-bit vector of [8 x i32].
1276/// \param __b
1277/// A 256-bit vector of [8 x i32].
1278/// \returns A 256-bit vector of [8 x i32] containing the result.
1279static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1280_mm256_min_epu32(__m256i __a, __m256i __b) {
1281 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1282}
1283
1284/// Creates a 32-bit integer mask from the most significant bit of each byte
1285/// in the 256-bit integer vector in \a __a and returns the result.
1286///
1287/// \code{.operation}
1288/// FOR i := 0 TO 31
1289/// j := i*8
1290/// result[i] := __a[j+7]
1291/// ENDFOR
1292/// \endcode
1293///
1294/// \headerfile <immintrin.h>
1295///
1296/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1297///
1298/// \param __a
1299/// A 256-bit integer vector containing the source bytes.
1300/// \returns The 32-bit integer mask.
1301static __inline__ int __DEFAULT_FN_ATTRS256_CONSTEXPR
1303 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1304}
1305
1306/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1307/// the 16-bit values in the corresponding elements of a 256-bit vector
1308/// of [16 x i16].
1309///
1310/// \code{.operation}
1311/// FOR i := 0 TO 15
1312/// j := i*8
1313/// k := i*16
1314/// result[k+15:k] := SignExtend(__V[j+7:j])
1315/// ENDFOR
1316/// \endcode
1317///
1318/// \headerfile <immintrin.h>
1319///
1320/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1321///
1322/// \param __V
1323/// A 128-bit integer vector containing the source bytes.
1324/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1325/// values.
1326static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1328 /* This function always performs a signed extension, but __v16qi is a char
1329 which may be signed or unsigned, so use __v16qs. */
1330 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1331}
1332
1333/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1334/// \a __V and returns the 32-bit values in the corresponding elements of a
1335/// 256-bit vector of [8 x i32].
1336///
1337/// \code{.operation}
1338/// FOR i := 0 TO 7
1339/// j := i*8
1340/// k := i*32
1341/// result[k+31:k] := SignExtend(__V[j+7:j])
1342/// ENDFOR
1343/// \endcode
1344///
1345/// \headerfile <immintrin.h>
1346///
1347/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1348///
1349/// \param __V
1350/// A 128-bit integer vector containing the source bytes.
1351/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1352/// values.
1353static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1355 /* This function always performs a signed extension, but __v16qi is a char
1356 which may be signed or unsigned, so use __v16qs. */
1357 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1358}
1359
1360/// Sign-extends the first four bytes from the 128-bit integer vector in
1361/// \a __V and returns the 64-bit values in the corresponding elements of a
1362/// 256-bit vector of [4 x i64].
1363///
1364/// \code{.operation}
1365/// result[63:0] := SignExtend(__V[7:0])
1366/// result[127:64] := SignExtend(__V[15:8])
1367/// result[191:128] := SignExtend(__V[23:16])
1368/// result[255:192] := SignExtend(__V[31:24])
1369/// \endcode
1370///
1371/// \headerfile <immintrin.h>
1372///
1373/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1374///
1375/// \param __V
1376/// A 128-bit integer vector containing the source bytes.
1377/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1378/// values.
1379static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1381 /* This function always performs a signed extension, but __v16qi is a char
1382 which may be signed or unsigned, so use __v16qs. */
1383 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1384}
1385
1386/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1387/// \a __V and returns the 32-bit values in the corresponding elements of a
1388/// 256-bit vector of [8 x i32].
1389///
1390/// \code{.operation}
1391/// FOR i := 0 TO 7
1392/// j := i*16
1393/// k := i*32
1394/// result[k+31:k] := SignExtend(__V[j+15:j])
1395/// ENDFOR
1396/// \endcode
1397///
1398/// \headerfile <immintrin.h>
1399///
1400/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1401///
1402/// \param __V
1403/// A 128-bit vector of [8 x i16] containing the source values.
1404/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1405/// values.
1406static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1408 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1409}
1410
1411/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1412/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1413/// elements of a 256-bit vector of [4 x i64].
1414///
1415/// \code{.operation}
1416/// result[63:0] := SignExtend(__V[15:0])
1417/// result[127:64] := SignExtend(__V[31:16])
1418/// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
1420/// \endcode
1421///
1422/// \headerfile <immintrin.h>
1423///
1424/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1425///
1426/// \param __V
1427/// A 128-bit vector of [8 x i16] containing the source values.
1428/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1429/// values.
1430static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1432 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1433}
1434
1435/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1436/// \a __V and returns the 64-bit values in the corresponding elements of a
1437/// 256-bit vector of [4 x i64].
1438///
1439/// \code{.operation}
1440/// result[63:0] := SignExtend(__V[31:0])
1441/// result[127:64] := SignExtend(__V[63:32])
1442/// result[191:128] := SignExtend(__V[95:64])
1443/// result[255:192] := SignExtend(__V[127:96])
1444/// \endcode
1445///
1446/// \headerfile <immintrin.h>
1447///
1448/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1449///
1450/// \param __V
1451/// A 128-bit vector of [4 x i32] containing the source values.
1452/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1453/// values.
1454static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1456 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1457}
1458
1459/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1460/// the 16-bit values in the corresponding elements of a 256-bit vector
1461/// of [16 x i16].
1462///
1463/// \code{.operation}
1464/// FOR i := 0 TO 15
1465/// j := i*8
1466/// k := i*16
1467/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1468/// ENDFOR
1469/// \endcode
1470///
1471/// \headerfile <immintrin.h>
1472///
1473/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1474///
1475/// \param __V
1476/// A 128-bit integer vector containing the source bytes.
1477/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1478/// values.
1479static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1481 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1482}
1483
1484/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1485/// \a __V and returns the 32-bit values in the corresponding elements of a
1486/// 256-bit vector of [8 x i32].
1487///
1488/// \code{.operation}
1489/// FOR i := 0 TO 7
1490/// j := i*8
1491/// k := i*32
1492/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1493/// ENDFOR
1494/// \endcode
1495///
1496/// \headerfile <immintrin.h>
1497///
1498/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1499///
1500/// \param __V
1501/// A 128-bit integer vector containing the source bytes.
1502/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1503/// values.
1504static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1506 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1507}
1508
1509/// Zero-extends the first four bytes from the 128-bit integer vector in
1510/// \a __V and returns the 64-bit values in the corresponding elements of a
1511/// 256-bit vector of [4 x i64].
1512///
1513/// \code{.operation}
1514/// result[63:0] := ZeroExtend(__V[7:0])
1515/// result[127:64] := ZeroExtend(__V[15:8])
1516/// result[191:128] := ZeroExtend(__V[23:16])
1517/// result[255:192] := ZeroExtend(__V[31:24])
1518/// \endcode
1519///
1520/// \headerfile <immintrin.h>
1521///
1522/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1523///
1524/// \param __V
1525/// A 128-bit integer vector containing the source bytes.
1526/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1527/// values.
1528static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1530 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1531}
1532
1533/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1534/// \a __V and returns the 32-bit values in the corresponding elements of a
1535/// 256-bit vector of [8 x i32].
1536///
1537/// \code{.operation}
1538/// FOR i := 0 TO 7
1539/// j := i*16
1540/// k := i*32
1541/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1542/// ENDFOR
1543/// \endcode
1544///
1545/// \headerfile <immintrin.h>
1546///
1547/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1548///
1549/// \param __V
1550/// A 128-bit vector of [8 x i16] containing the source values.
1551/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1552/// values.
1553static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1555 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1556}
1557
1558/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1559/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1560/// elements of a 256-bit vector of [4 x i64].
1561///
1562/// \code{.operation}
1563/// result[63:0] := ZeroExtend(__V[15:0])
1564/// result[127:64] := ZeroExtend(__V[31:16])
1565/// result[191:128] := ZeroExtend(__V[47:32])
/// result[255:192] := ZeroExtend(__V[63:48])
1567/// \endcode
1568///
1569/// \headerfile <immintrin.h>
1570///
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1572///
1573/// \param __V
1574/// A 128-bit vector of [8 x i16] containing the source values.
1575/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1576/// values.
1577static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1579 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1580}
1581
1582/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1583/// \a __V and returns the 64-bit values in the corresponding elements of a
1584/// 256-bit vector of [4 x i64].
1585///
1586/// \code{.operation}
1587/// result[63:0] := ZeroExtend(__V[31:0])
1588/// result[127:64] := ZeroExtend(__V[63:32])
1589/// result[191:128] := ZeroExtend(__V[95:64])
1590/// result[255:192] := ZeroExtend(__V[127:96])
1591/// \endcode
1592///
1593/// \headerfile <immintrin.h>
1594///
1595/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1596///
1597/// \param __V
1598/// A 128-bit vector of [4 x i32] containing the source values.
1599/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1600/// values.
1601static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1603 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1604}
1605
1606/// Multiplies signed 32-bit integers from even-numbered elements of two
1607/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1608/// [4 x i64] result.
1609///
1610/// \code{.operation}
1611/// result[63:0] := __a[31:0] * __b[31:0]
1612/// result[127:64] := __a[95:64] * __b[95:64]
1613/// result[191:128] := __a[159:128] * __b[159:128]
1614/// result[255:192] := __a[223:192] * __b[223:192]
1615/// \endcode
1616///
1617/// \headerfile <immintrin.h>
1618///
1619/// This intrinsic corresponds to the \c VPMULDQ instruction.
1620///
1621/// \param __a
1622/// A 256-bit vector of [8 x i32] containing one of the source operands.
1623/// \param __b
1624/// A 256-bit vector of [8 x i32] containing one of the source operands.
1625/// \returns A 256-bit vector of [4 x i64] containing the products.
1626static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1627_mm256_mul_epi32(__m256i __a, __m256i __b) {
1628 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1629}
1630
1631/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1632/// [16 x i16], truncates the 32-bit results to the most significant 18
1633/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1634/// product in the [16 x i16] result.
1635///
1636/// \code{.operation}
1637/// FOR i := 0 TO 15
1638/// j := i*16
1639/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
/// result[j+15:j] := temp[16:1]
/// ENDFOR
/// \endcode
1642///
1643/// \headerfile <immintrin.h>
1644///
1645/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1646///
1647/// \param __a
1648/// A 256-bit vector of [16 x i16] containing one of the source operands.
1649/// \param __b
1650/// A 256-bit vector of [16 x i16] containing one of the source operands.
1651/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1652static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1653_mm256_mulhrs_epi16(__m256i __a, __m256i __b) {
1654 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1655}
1656
1657/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1658/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1659/// [16 x i16] result.
1660///
1661/// \headerfile <immintrin.h>
1662///
1663/// This intrinsic corresponds to the \c VPMULHUW instruction.
1664///
1665/// \param __a
1666/// A 256-bit vector of [16 x i16] containing one of the source operands.
1667/// \param __b
1668/// A 256-bit vector of [16 x i16] containing one of the source operands.
1669/// \returns A 256-bit vector of [16 x i16] containing the products.
1670static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1671_mm256_mulhi_epu16(__m256i __a, __m256i __b) {
1672 return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
1673}
1674
1675/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1677/// [16 x i16] result.
1678///
1679/// \headerfile <immintrin.h>
1680///
1681/// This intrinsic corresponds to the \c VPMULHW instruction.
1682///
1683/// \param __a
1684/// A 256-bit vector of [16 x i16] containing one of the source operands.
1685/// \param __b
1686/// A 256-bit vector of [16 x i16] containing one of the source operands.
1687/// \returns A 256-bit vector of [16 x i16] containing the products.
1688static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1689_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1690{
1691 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1692}
1693
1694/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1695/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1696/// [16 x i16] result.
1697///
1698/// \headerfile <immintrin.h>
1699///
1700/// This intrinsic corresponds to the \c VPMULLW instruction.
1701///
1702/// \param __a
1703/// A 256-bit vector of [16 x i16] containing one of the source operands.
1704/// \param __b
1705/// A 256-bit vector of [16 x i16] containing one of the source operands.
1706/// \returns A 256-bit vector of [16 x i16] containing the products.
1707static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1708_mm256_mullo_epi16(__m256i __a, __m256i __b)
1709{
1710 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1711}
1712
1713/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1714/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1715/// [8 x i32] result.
1716///
1717/// \headerfile <immintrin.h>
1718///
1719/// This intrinsic corresponds to the \c VPMULLD instruction.
1720///
1721/// \param __a
1722/// A 256-bit vector of [8 x i32] containing one of the source operands.
1723/// \param __b
1724/// A 256-bit vector of [8 x i32] containing one of the source operands.
1725/// \returns A 256-bit vector of [8 x i32] containing the products.
1726static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1727_mm256_mullo_epi32(__m256i __a, __m256i __b) {
1728 return (__m256i)((__v8su)__a * (__v8su)__b);
1729}
1730
1731/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1732/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1733/// [4 x i64] result.
1734///
1735/// \code{.operation}
1736/// result[63:0] := __a[31:0] * __b[31:0]
1737/// result[127:64] := __a[95:64] * __b[95:64]
1738/// result[191:128] := __a[159:128] * __b[159:128]
1739/// result[255:192] := __a[223:192] * __b[223:192]
1740/// \endcode
1741///
1742/// \headerfile <immintrin.h>
1743///
1744/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1745///
1746/// \param __a
1747/// A 256-bit vector of [8 x i32] containing one of the source operands.
1748/// \param __b
1749/// A 256-bit vector of [8 x i32] containing one of the source operands.
1750/// \returns A 256-bit vector of [4 x i64] containing the products.
1751static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1752_mm256_mul_epu32(__m256i __a, __m256i __b) {
1753 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1754}
1755
1756/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1757/// \a __b.
1758///
1759/// \headerfile <immintrin.h>
1760///
1761/// This intrinsic corresponds to the \c VPOR instruction.
1762///
1763/// \param __a
1764/// A 256-bit integer vector.
1765/// \param __b
1766/// A 256-bit integer vector.
1767/// \returns A 256-bit integer vector containing the result.
1768static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1769_mm256_or_si256(__m256i __a, __m256i __b)
1770{
1771 return (__m256i)((__v4du)__a | (__v4du)__b);
1772}
1773
1774/// Computes four sum of absolute difference (SAD) operations on sets of eight
1775/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1776/// \a __b.
1777///
1778/// One SAD result is computed for each set of eight bytes from \a __a and
1779/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1780/// corresponding 64-bit element of the result.
1781///
1782/// A single SAD operation takes the differences between the corresponding
1783/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1784/// and sums these eight values to form one 16-bit result. This operation
1785/// is repeated four times with successive sets of eight bytes.
1786///
1787/// \code{.operation}
1788/// FOR i := 0 TO 3
1789/// j := i*64
1790/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1791/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1792/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1793/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1794/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1795/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1796/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1797/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1798/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1799/// temp4 + temp5 + temp6 + temp7
1800/// result[j+63:j+16] := 0
1801/// ENDFOR
1802/// \endcode
1803///
1804/// \headerfile <immintrin.h>
1805///
1806/// This intrinsic corresponds to the \c VPSADBW instruction.
1807///
1808/// \param __a
1809/// A 256-bit integer vector.
1810/// \param __b
1811/// A 256-bit integer vector.
1812/// \returns A 256-bit integer vector containing the result.
1813static __inline__ __m256i __DEFAULT_FN_ATTRS256
1814_mm256_sad_epu8(__m256i __a, __m256i __b)
1815{
1816 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1817}
1818
1819/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1820/// to control information in the 256-bit integer vector \a __b, and
1821/// returns the 256-bit result. In effect there are two separate 128-bit
1822/// shuffles in the lower and upper halves.
1823///
1824/// \code{.operation}
1825/// FOR i := 0 TO 31
1826/// j := i*8
1827/// IF __b[j+7] == 1
1828/// result[j+7:j] := 0
1829/// ELSE
1830/// k := __b[j+3:j] * 8
1831/// IF i > 15
1832/// k := k + 128
1833/// FI
1834/// result[j+7:j] := __a[k+7:k]
1835/// FI
1836/// ENDFOR
1837/// \endcode
1838///
1839/// \headerfile <immintrin.h>
1840///
1841/// This intrinsic corresponds to the \c VPSHUFB instruction.
1842///
1843/// \param __a
1844/// A 256-bit integer vector containing source values.
1845/// \param __b
1846/// A 256-bit integer vector containing control information to determine
1847/// what goes into the corresponding byte of the result. If bit 7 of the
1848/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1849/// control byte specify the index (within the same 128-bit half) of \a __a
1850/// to copy to the result byte.
1851/// \returns A 256-bit integer vector containing the result.
1852static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1853_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
1854 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1855}
1856
1857/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1858/// according to control information in the integer literal \a imm, and
1859/// returns the 256-bit result. In effect there are two parallel 128-bit
1860/// shuffles in the lower and upper halves.
1861///
1862/// \code{.operation}
1863/// FOR i := 0 TO 3
1864/// j := i*32
1865/// k := (imm >> i*2)[1:0] * 32
1866/// result[j+31:j] := a[k+31:k]
1867/// result[128+j+31:128+j] := a[128+k+31:128+k]
1868/// ENDFOR
1869/// \endcode
1870///
1871/// \headerfile <immintrin.h>
1872///
1873/// \code
1874/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1875/// \endcode
1876///
1877/// This intrinsic corresponds to the \c VPSHUFD instruction.
1878///
1879/// \param a
1880/// A 256-bit vector of [8 x i32] containing source values.
1881/// \param imm
1882/// An immediate 8-bit value specifying which elements to copy from \a a.
1883/// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1884/// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1885/// forth.
1886/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm)                                           \
  ((__m256i)__builtin_ia32_pshufd256(                                          \
      (__v8si)(__m256i)(a), (int)(imm)))
1889
1890/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1891/// according to control information in the integer literal \a imm, and
1892/// returns the 256-bit result. The upper 64 bits of each 128-bit half
1893/// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1894/// copied from \a a unchanged.
1895///
1896/// \code{.operation}
1897/// result[63:0] := a[63:0]
1898/// result[191:128] := a[191:128]
1899/// FOR i := 0 TO 3
1900/// j := i * 16 + 64
1901/// k := (imm >> i*2)[1:0] * 16 + 64
1902/// result[j+15:j] := a[k+15:k]
1903/// result[128+j+15:128+j] := a[128+k+15:128+k]
1904/// ENDFOR
1905/// \endcode
1906///
1907/// \headerfile <immintrin.h>
1908///
1909/// \code
1910/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1911/// \endcode
1912///
1913/// This intrinsic corresponds to the \c VPSHUFHW instruction.
1914///
1915/// \param a
1916/// A 256-bit vector of [16 x i16] containing source values.
1917/// \param imm
1918/// An immediate 8-bit value specifying which elements to copy from \a a.
1919/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1920/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1921/// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1922/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshufhw256(                                         \
      (__v16hi)(__m256i)(a), (int)(imm)))
1925
1926/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1927/// according to control information in the integer literal \a imm, and
1928/// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1929/// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1930/// copied from \a a unchanged.
1931///
1932/// \code{.operation}
1933/// result[127:64] := a[127:64]
1934/// result[255:192] := a[255:192]
1935/// FOR i := 0 TO 3
1936/// j := i * 16
1937/// k := (imm >> i*2)[1:0] * 16
1938/// result[j+15:j] := a[k+15:k]
1939/// result[128+j+15:128+j] := a[128+k+15:128+k]
1940/// ENDFOR
1941/// \endcode
1942///
1943/// \headerfile <immintrin.h>
1944///
1945/// \code
1946/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1947/// \endcode
1948///
1949/// This intrinsic corresponds to the \c VPSHUFLW instruction.
1950///
1951/// \param a
1952/// A 256-bit vector of [16 x i16] to use as a source of data for the
1953/// result.
1954/// \param imm
1955/// An immediate 8-bit value specifying which elements to copy from \a a.
1956/// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1957/// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1958/// forth.
1959/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256(                                         \
      (__v16hi)(__m256i)(a), (int)(imm)))
1962
1963/// Sets each byte of the result to the corresponding byte of the 256-bit
1964/// integer vector in \a __a, the negative of that byte, or zero, depending
1965/// on whether the corresponding byte of the 256-bit integer vector in
1966/// \a __b is greater than zero, less than zero, or equal to zero,
1967/// respectively.
1968///
1969/// \headerfile <immintrin.h>
1970///
1971/// This intrinsic corresponds to the \c VPSIGNB instruction.
1972///
1973/// \param __a
1974/// A 256-bit integer vector.
1975/// \param __b
1976/// A 256-bit integer vector.
1977/// \returns A 256-bit integer vector containing the result.
1978static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1979_mm256_sign_epi8(__m256i __a, __m256i __b) {
1980 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
1981}
1982
1983/// Sets each element of the result to the corresponding element of the
1984/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
1985/// or zero, depending on whether the corresponding element of the 256-bit
1986/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
1987/// equal to zero, respectively.
1988///
1989/// \headerfile <immintrin.h>
1990///
1991/// This intrinsic corresponds to the \c VPSIGNW instruction.
1992///
1993/// \param __a
1994/// A 256-bit vector of [16 x i16].
1995/// \param __b
1996/// A 256-bit vector of [16 x i16].
1997/// \returns A 256-bit vector of [16 x i16] containing the result.
1998static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1999_mm256_sign_epi16(__m256i __a, __m256i __b) {
2000 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2001}
2002
2003/// Sets each element of the result to the corresponding element of the
2004/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2005/// zero, depending on whether the corresponding element of the 256-bit
2006/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2007/// equal to zero, respectively.
2008///
2009/// \headerfile <immintrin.h>
2010///
2011/// This intrinsic corresponds to the \c VPSIGND instruction.
2012///
2013/// \param __a
2014/// A 256-bit vector of [8 x i32].
2015/// \param __b
2016/// A 256-bit vector of [8 x i32].
2017/// \returns A 256-bit vector of [8 x i32] containing the result.
2018static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2019_mm256_sign_epi32(__m256i __a, __m256i __b) {
2020 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2021}
2022
2023/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2024/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2025/// is greater than 15, the returned result is all zeroes.
2026///
2027/// \headerfile <immintrin.h>
2028///
2029/// \code
2030/// __m256i _mm256_slli_si256(__m256i a, const int imm);
2031/// \endcode
2032///
2033/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2034///
2035/// \param a
2036/// A 256-bit integer vector to be shifted.
2037/// \param imm
2038/// An unsigned immediate value specifying the shift count (in bytes).
2039/// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                               \
      (__v32qi)(__m256i)(a), (int)(imm)))
2043
2044/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2045/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2046/// is greater than 15, the returned result is all zeroes.
2047///
2048/// \headerfile <immintrin.h>
2049///
2050/// \code
2051/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2052/// \endcode
2053///
2054/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2055///
2056/// \param a
2057/// A 256-bit integer vector to be shifted.
2058/// \param imm
2059/// An unsigned immediate value specifying the shift count (in bytes).
2060/// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                               \
      (__v32qi)(__m256i)(a), (int)(imm)))
2064
2065/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2066/// left by \a __count bits, shifting in zero bits, and returns the result.
2067/// If \a __count is greater than 15, the returned result is all zeroes.
2068///
2069/// \headerfile <immintrin.h>
2070///
2071/// This intrinsic corresponds to the \c VPSLLW instruction.
2072///
2073/// \param __a
2074/// A 256-bit vector of [16 x i16] to be shifted.
2075/// \param __count
2076/// An unsigned integer value specifying the shift count (in bits).
2077/// \returns A 256-bit vector of [16 x i16] containing the result.
2078static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2079_mm256_slli_epi16(__m256i __a, int __count) {
2080 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2081}
2082
2083/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2084/// left by the number of bits specified by the lower 64 bits of \a __count,
2085/// shifting in zero bits, and returns the result. If \a __count is greater
2086/// than 15, the returned result is all zeroes.
2087///
2088/// \headerfile <immintrin.h>
2089///
2090/// This intrinsic corresponds to the \c VPSLLW instruction.
2091///
2092/// \param __a
2093/// A 256-bit vector of [16 x i16] to be shifted.
2094/// \param __count
2095/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2096/// shift count (in bits). The upper element is ignored.
2097/// \returns A 256-bit vector of [16 x i16] containing the result.
2098static __inline__ __m256i __DEFAULT_FN_ATTRS256
2099_mm256_sll_epi16(__m256i __a, __m128i __count)
2100{
2101 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2102}
2103
2104/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2105/// left by \a __count bits, shifting in zero bits, and returns the result.
2106/// If \a __count is greater than 31, the returned result is all zeroes.
2107///
2108/// \headerfile <immintrin.h>
2109///
2110/// This intrinsic corresponds to the \c VPSLLD instruction.
2111///
2112/// \param __a
2113/// A 256-bit vector of [8 x i32] to be shifted.
2114/// \param __count
2115/// An unsigned integer value specifying the shift count (in bits).
2116/// \returns A 256-bit vector of [8 x i32] containing the result.
2117static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2118_mm256_slli_epi32(__m256i __a, int __count) {
2119 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2120}
2121
2122/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2123/// left by the number of bits given in the lower 64 bits of \a __count,
2124/// shifting in zero bits, and returns the result. If \a __count is greater
2125/// than 31, the returned result is all zeroes.
2126///
2127/// \headerfile <immintrin.h>
2128///
2129/// This intrinsic corresponds to the \c VPSLLD instruction.
2130///
2131/// \param __a
2132/// A 256-bit vector of [8 x i32] to be shifted.
2133/// \param __count
2134/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2135/// shift count (in bits). The upper element is ignored.
2136/// \returns A 256-bit vector of [8 x i32] containing the result.
2137static __inline__ __m256i __DEFAULT_FN_ATTRS256
2138_mm256_sll_epi32(__m256i __a, __m128i __count)
2139{
2140 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2141}
2142
2143/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2144/// left by \a __count bits, shifting in zero bits, and returns the result.
2145/// If \a __count is greater than 63, the returned result is all zeroes.
2146///
2147/// \headerfile <immintrin.h>
2148///
2149/// This intrinsic corresponds to the \c VPSLLQ instruction.
2150///
2151/// \param __a
2152/// A 256-bit vector of [4 x i64] to be shifted.
2153/// \param __count
2154/// An unsigned integer value specifying the shift count (in bits).
2155/// \returns A 256-bit vector of [4 x i64] containing the result.
2156static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2157_mm256_slli_epi64(__m256i __a, int __count) {
2158 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2159}
2160
2161/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2162/// left by the number of bits given in the lower 64 bits of \a __count,
2163/// shifting in zero bits, and returns the result. If \a __count is greater
2164/// than 63, the returned result is all zeroes.
2165///
2166/// \headerfile <immintrin.h>
2167///
2168/// This intrinsic corresponds to the \c VPSLLQ instruction.
2169///
2170/// \param __a
2171/// A 256-bit vector of [4 x i64] to be shifted.
2172/// \param __count
2173/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2174/// shift count (in bits). The upper element is ignored.
2175/// \returns A 256-bit vector of [4 x i64] containing the result.
2176static __inline__ __m256i __DEFAULT_FN_ATTRS256
2177_mm256_sll_epi64(__m256i __a, __m128i __count)
2178{
2179 return __builtin_ia32_psllq256((__v4di)__a, __count);
2180}
2181
2182/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2183/// right by \a __count bits, shifting in sign bits, and returns the result.
2184/// If \a __count is greater than 15, each element of the result is either
2185/// 0 or -1 according to the corresponding input sign bit.
2186///
2187/// \headerfile <immintrin.h>
2188///
2189/// This intrinsic corresponds to the \c VPSRAW instruction.
2190///
2191/// \param __a
2192/// A 256-bit vector of [16 x i16] to be shifted.
2193/// \param __count
2194/// An unsigned integer value specifying the shift count (in bits).
2195/// \returns A 256-bit vector of [16 x i16] containing the result.
2196static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2197_mm256_srai_epi16(__m256i __a, int __count) {
2198 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2199}
2200
2201/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2202/// right by the number of bits given in the lower 64 bits of \a __count,
2203/// shifting in sign bits, and returns the result. If \a __count is greater
2204/// than 15, each element of the result is either 0 or -1 according to the
2205/// corresponding input sign bit.
2206///
2207/// \headerfile <immintrin.h>
2208///
2209/// This intrinsic corresponds to the \c VPSRAW instruction.
2210///
2211/// \param __a
2212/// A 256-bit vector of [16 x i16] to be shifted.
2213/// \param __count
2214/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2215/// shift count (in bits). The upper element is ignored.
2216/// \returns A 256-bit vector of [16 x i16] containing the result.
2217static __inline__ __m256i __DEFAULT_FN_ATTRS256
2218_mm256_sra_epi16(__m256i __a, __m128i __count)
2219{
2220 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2221}
2222
2223/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2224/// right by \a __count bits, shifting in sign bits, and returns the result.
2225/// If \a __count is greater than 31, each element of the result is either
2226/// 0 or -1 according to the corresponding input sign bit.
2227///
2228/// \headerfile <immintrin.h>
2229///
2230/// This intrinsic corresponds to the \c VPSRAD instruction.
2231///
2232/// \param __a
2233/// A 256-bit vector of [8 x i32] to be shifted.
2234/// \param __count
2235/// An unsigned integer value specifying the shift count (in bits).
2236/// \returns A 256-bit vector of [8 x i32] containing the result.
2237static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2238_mm256_srai_epi32(__m256i __a, int __count) {
2239 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2240}
2241
2242/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2243/// right by the number of bits given in the lower 64 bits of \a __count,
2244/// shifting in sign bits, and returns the result. If \a __count is greater
2245/// than 31, each element of the result is either 0 or -1 according to the
2246/// corresponding input sign bit.
2247///
2248/// \headerfile <immintrin.h>
2249///
2250/// This intrinsic corresponds to the \c VPSRAD instruction.
2251///
2252/// \param __a
2253/// A 256-bit vector of [8 x i32] to be shifted.
2254/// \param __count
2255/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2256/// shift count (in bits). The upper element is ignored.
2257/// \returns A 256-bit vector of [8 x i32] containing the result.
2258static __inline__ __m256i __DEFAULT_FN_ATTRS256
2259_mm256_sra_epi32(__m256i __a, __m128i __count)
2260{
2261 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2262}
2263
2264/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2265/// \a imm bytes, shifting in zero bytes, and returns the result. If
2266/// \a imm is greater than 15, the returned result is all zeroes.
2267///
2268/// \headerfile <immintrin.h>
2269///
2270/// \code
2271/// __m256i _mm256_srli_si256(__m256i a, const int imm);
2272/// \endcode
2273///
2274/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2275///
2276/// \param a
2277/// A 256-bit integer vector to be shifted.
2278/// \param imm
2279/// An unsigned immediate value specifying the shift count (in bytes).
2280/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                               \
      (__v32qi)(__m256i)(a), (int)(imm)))
2284
2285/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2286/// \a imm bytes, shifting in zero bytes, and returns the result. If
2287/// \a imm is greater than 15, the returned result is all zeroes.
2288///
2289/// \headerfile <immintrin.h>
2290///
2291/// \code
2292/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2293/// \endcode
2294///
2295/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2296///
2297/// \param a
2298/// A 256-bit integer vector to be shifted.
2299/// \param imm
2300/// An unsigned immediate value specifying the shift count (in bytes).
2301/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                               \
      (__v32qi)(__m256i)(a), (int)(imm)))
2305
2306/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2307/// right by \a __count bits, shifting in zero bits, and returns the result.
2308/// If \a __count is greater than 15, the returned result is all zeroes.
2309///
2310/// \headerfile <immintrin.h>
2311///
2312/// This intrinsic corresponds to the \c VPSRLW instruction.
2313///
2314/// \param __a
2315/// A 256-bit vector of [16 x i16] to be shifted.
2316/// \param __count
2317/// An unsigned integer value specifying the shift count (in bits).
2318/// \returns A 256-bit vector of [16 x i16] containing the result.
2319static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2320_mm256_srli_epi16(__m256i __a, int __count) {
2321 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2322}
2323
2324/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2325/// right by the number of bits given in the lower 64 bits of \a __count,
2326/// shifting in zero bits, and returns the result. If \a __count is greater
2327/// than 15, the returned result is all zeroes.
2328///
2329/// \headerfile <immintrin.h>
2330///
2331/// This intrinsic corresponds to the \c VPSRLW instruction.
2332///
2333/// \param __a
2334/// A 256-bit vector of [16 x i16] to be shifted.
2335/// \param __count
2336/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2337/// shift count (in bits). The upper element is ignored.
2338/// \returns A 256-bit vector of [16 x i16] containing the result.
2339static __inline__ __m256i __DEFAULT_FN_ATTRS256
2340_mm256_srl_epi16(__m256i __a, __m128i __count)
2341{
2342 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2343}
2344
2345/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2346/// right by \a __count bits, shifting in zero bits, and returns the result.
2347/// If \a __count is greater than 31, the returned result is all zeroes.
2348///
2349/// \headerfile <immintrin.h>
2350///
2351/// This intrinsic corresponds to the \c VPSRLD instruction.
2352///
2353/// \param __a
2354/// A 256-bit vector of [8 x i32] to be shifted.
2355/// \param __count
2356/// An unsigned integer value specifying the shift count (in bits).
2357/// \returns A 256-bit vector of [8 x i32] containing the result.
2358static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2359_mm256_srli_epi32(__m256i __a, int __count) {
2360 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2361}
2362
2363/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2364/// right by the number of bits given in the lower 64 bits of \a __count,
2365/// shifting in zero bits, and returns the result. If \a __count is greater
2366/// than 31, the returned result is all zeroes.
2367///
2368/// \headerfile <immintrin.h>
2369///
2370/// This intrinsic corresponds to the \c VPSRLD instruction.
2371///
2372/// \param __a
2373/// A 256-bit vector of [8 x i32] to be shifted.
2374/// \param __count
2375/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2376/// shift count (in bits). The upper element is ignored.
2377/// \returns A 256-bit vector of [8 x i32] containing the result.
2378static __inline__ __m256i __DEFAULT_FN_ATTRS256
2379_mm256_srl_epi32(__m256i __a, __m128i __count)
2380{
2381 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2382}
2383
2384/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2385/// right by \a __count bits, shifting in zero bits, and returns the result.
2386/// If \a __count is greater than 63, the returned result is all zeroes.
2387///
2388/// \headerfile <immintrin.h>
2389///
2390/// This intrinsic corresponds to the \c VPSRLQ instruction.
2391///
2392/// \param __a
2393/// A 256-bit vector of [4 x i64] to be shifted.
2394/// \param __count
2395/// An unsigned integer value specifying the shift count (in bits).
2396/// \returns A 256-bit vector of [4 x i64] containing the result.
2397static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2398_mm256_srli_epi64(__m256i __a, int __count) {
2399 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2400}
2401
2402/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2403/// right by the number of bits given in the lower 64 bits of \a __count,
2404/// shifting in zero bits, and returns the result. If \a __count is greater
2405/// than 63, the returned result is all zeroes.
2406///
2407/// \headerfile <immintrin.h>
2408///
2409/// This intrinsic corresponds to the \c VPSRLQ instruction.
2410///
2411/// \param __a
2412/// A 256-bit vector of [4 x i64] to be shifted.
2413/// \param __count
2414/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2415/// shift count (in bits). The upper element is ignored.
2416/// \returns A 256-bit vector of [4 x i64] containing the result.
2417static __inline__ __m256i __DEFAULT_FN_ATTRS256
2418_mm256_srl_epi64(__m256i __a, __m128i __count)
2419{
2420 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2421}
2422
2423/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2424/// vectors. Returns the lower 8 bits of each difference in the
2425/// corresponding byte of the 256-bit integer vector result (overflow is
2426/// ignored).
2427///
2428/// \code{.operation}
2429/// FOR i := 0 TO 31
2430/// j := i*8
2431/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2432/// ENDFOR
2433/// \endcode
2434///
2435/// \headerfile <immintrin.h>
2436///
2437/// This intrinsic corresponds to the \c VPSUBB instruction.
2438///
2439/// \param __a
2440/// A 256-bit integer vector containing the minuends.
2441/// \param __b
2442/// A 256-bit integer vector containing the subtrahends.
2443/// \returns A 256-bit integer vector containing the differences.
2444static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2445_mm256_sub_epi8(__m256i __a, __m256i __b) {
 /* The subtraction is done on the __v32qu view of both operands (typedef not
    visible here; presumably the unsigned 8-bit lane type), which fits the
    documented wrap-around result. The difference is cast back to __m256i. */
2446 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2447}
2448
2449/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2450/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2451/// the corresponding element of the [16 x i16] result (overflow is
2452/// ignored).
2453///
2454/// \code{.operation}
2455/// FOR i := 0 TO 15
2456/// j := i*16
2457/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2458/// ENDFOR
2459/// \endcode
2460///
2461/// \headerfile <immintrin.h>
2462///
2463/// This intrinsic corresponds to the \c VPSUBW instruction.
2464///
2465/// \param __a
2466/// A 256-bit vector of [16 x i16] containing the minuends.
2467/// \param __b
2468/// A 256-bit vector of [16 x i16] containing the subtrahends.
2469/// \returns A 256-bit vector of [16 x i16] containing the differences.
2470static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2471_mm256_sub_epi16(__m256i __a, __m256i __b) {
 /* The subtraction is done on the __v16hu view of both operands (typedef not
    visible here; presumably the unsigned 16-bit lane type), which fits the
    documented wrap-around result. The difference is cast back to __m256i. */
2472 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2473}
2474
2475/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2476/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2477/// the corresponding element of the [8 x i32] result (overflow is ignored).
2478///
2479/// \code{.operation}
2480/// FOR i := 0 TO 7
2481/// j := i*32
2482/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2483/// ENDFOR
2484/// \endcode
2485///
2486/// \headerfile <immintrin.h>
2487///
2488/// This intrinsic corresponds to the \c VPSUBD instruction.
2489///
2490/// \param __a
2491/// A 256-bit vector of [8 x i32] containing the minuends.
2492/// \param __b
2493/// A 256-bit vector of [8 x i32] containing the subtrahends.
2494/// \returns A 256-bit vector of [8 x i32] containing the differences.
2495static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2496_mm256_sub_epi32(__m256i __a, __m256i __b) {
 /* The subtraction is done on the __v8su view of both operands (typedef not
    visible here; presumably the unsigned 32-bit lane type), which fits the
    documented wrap-around result. The difference is cast back to __m256i. */
2497 return (__m256i)((__v8su)__a - (__v8su)__b);
2498}
2499
2500/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2501/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2502/// the corresponding element of the [4 x i64] result (overflow is ignored).
2503///
2504/// \code{.operation}
2505/// FOR i := 0 TO 3
2506/// j := i*64
2507/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2508/// ENDFOR
2509/// \endcode
2510///
2511/// \headerfile <immintrin.h>
2512///
2513/// This intrinsic corresponds to the \c VPSUBQ instruction.
2514///
2515/// \param __a
2516/// A 256-bit vector of [4 x i64] containing the minuends.
2517/// \param __b
2518/// A 256-bit vector of [4 x i64] containing the subtrahends.
2519/// \returns A 256-bit vector of [4 x i64] containing the differences.
2520static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2521_mm256_sub_epi64(__m256i __a, __m256i __b) {
 /* The subtraction is done on the __v4du view of both operands (typedef not
    visible here; presumably the unsigned 64-bit lane type), which fits the
    documented wrap-around result. The difference is cast back to __m256i. */
2522 return (__m256i)((__v4du)__a - (__v4du)__b);
2523}
2524
2525/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2526/// vectors using signed saturation, and returns each difference in the
2527/// corresponding byte of the 256-bit integer vector result.
2528///
2529/// \code{.operation}
2530/// FOR i := 0 TO 31
2531/// j := i*8
2532/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2533/// ENDFOR
2534/// \endcode
2535///
2536/// \headerfile <immintrin.h>
2537///
2538/// This intrinsic corresponds to the \c VPSUBSB instruction.
2539///
2540/// \param __a
2541/// A 256-bit integer vector containing the minuends.
2542/// \param __b
2543/// A 256-bit integer vector containing the subtrahends.
2544/// \returns A 256-bit integer vector containing the differences.
2545static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2546_mm256_subs_epi8(__m256i __a, __m256i __b) {
 /* __builtin_elementwise_sub_sat takes its lane width and signedness from the
    operand type; the __v32qs view gives the signed 8-bit saturation that the
    description above documents. */
2547 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2548}
2549
2550/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2551/// vectors of [16 x i16] using signed saturation, and returns each
2552/// difference in the corresponding element of the [16 x i16] result.
2553///
2554/// \code{.operation}
2555/// FOR i := 0 TO 15
2556/// j := i*16
2557/// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2558/// ENDFOR
2559/// \endcode
2560///
2561/// \headerfile <immintrin.h>
2562///
2563/// This intrinsic corresponds to the \c VPSUBSW instruction.
2564///
2565/// \param __a
2566/// A 256-bit vector of [16 x i16] containing the minuends.
2567/// \param __b
2568/// A 256-bit vector of [16 x i16] containing the subtrahends.
2569/// \returns A 256-bit vector of [16 x i16] containing the differences.
2570static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2571_mm256_subs_epi16(__m256i __a, __m256i __b) {
 /* __builtin_elementwise_sub_sat takes its lane width and signedness from the
    operand type; the __v16hi view gives the signed 16-bit saturation that the
    description above documents. */
2572 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2573}
2574
2575/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2576/// vectors using unsigned saturation, and returns each difference in the
2577/// corresponding byte of the 256-bit integer vector result. For each byte,
2578/// computes <c> result = __a - __b </c>.
2579///
2580/// \code{.operation}
2581/// FOR i := 0 TO 31
2582/// j := i*8
2583/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2584/// ENDFOR
2585/// \endcode
2586///
2587/// \headerfile <immintrin.h>
2588///
2589/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2590///
2591/// \param __a
2592/// A 256-bit integer vector containing the minuends.
2593/// \param __b
2594/// A 256-bit integer vector containing the subtrahends.
2595/// \returns A 256-bit integer vector containing the differences.
2596static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2597_mm256_subs_epu8(__m256i __a, __m256i __b) {
 /* __builtin_elementwise_sub_sat takes its lane width and signedness from the
    operand type; the __v32qu view gives the unsigned 8-bit saturation that
    the description above documents. */
2598 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2599}
2600
2601/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2602/// vectors of [16 x i16] using unsigned saturation, and returns each
2603/// difference in the corresponding element of the [16 x i16] result.
2604///
2605/// \code{.operation}
2606/// FOR i := 0 TO 15
2607/// j := i*16
2608/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2609/// ENDFOR
2610/// \endcode
2611///
2612/// \headerfile <immintrin.h>
2613///
2614/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2615///
2616/// \param __a
2617/// A 256-bit vector of [16 x i16] containing the minuends.
2618/// \param __b
2619/// A 256-bit vector of [16 x i16] containing the subtrahends.
2620/// \returns A 256-bit vector of [16 x i16] containing the differences.
2621static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2622_mm256_subs_epu16(__m256i __a, __m256i __b) {
 /* __builtin_elementwise_sub_sat takes its lane width and signedness from the
    operand type; the __v16hu view gives the unsigned 16-bit saturation that
    the description above documents. */
2623 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2624}
2625
2626/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2627/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2628/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2629/// input; other bits in these parameters are ignored.
2630///
2631/// \code{.operation}
2632/// result[7:0] := __a[71:64]
2633/// result[15:8] := __b[71:64]
2634/// result[23:16] := __a[79:72]
2635/// result[31:24] := __b[79:72]
2636/// . . .
2637/// result[127:120] := __b[127:120]
2638/// result[135:128] := __a[199:192]
2639/// . . .
2640/// result[255:248] := __b[255:248]
2641/// \endcode
2642///
2643/// \headerfile <immintrin.h>
2644///
2645/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2646///
2647/// \param __a
2648/// A 256-bit integer vector used as the source for the even-numbered bytes
2649/// of the result.
2650/// \param __b
2651/// A 256-bit integer vector used as the source for the odd-numbered bytes
2652/// of the result.
2653/// \returns A 256-bit integer vector containing the result.
2654static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2655_mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
 /* Shuffle index k (0..31) picks byte k of __a and 32+k picks byte k of __b.
    The index pairs walk bytes 8..15 and then 24..31, i.e. the upper 8 bytes
    of each 128-bit half, alternating __a and __b. */
2656 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2657}
2658
2659/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2660/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2661/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2662/// 128-bit half of \a __a and \a __b as input; other bits in these
2663/// parameters are ignored.
2664///
2665/// \code{.operation}
2666/// result[15:0] := __a[79:64]
2667/// result[31:16] := __b[79:64]
2668/// result[47:32] := __a[95:80]
2669/// result[63:48] := __b[95:80]
2670/// . . .
2671/// result[127:112] := __b[127:112]
2672/// result[143:128] := __a[207:192]
2673/// . . .
2674/// result[255:240] := __b[255:240]
2675/// \endcode
2676///
2677/// \headerfile <immintrin.h>
2678///
2679/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2680///
2681/// \param __a
2682/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2683/// elements of the result.
2684/// \param __b
2685/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2686/// elements of the result.
2687/// \returns A 256-bit vector of [16 x i16] containing the result.
2688static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2689_mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
 /* Shuffle index k (0..15) picks element k of __a and 16+k picks element k of
    __b. The index pairs walk elements 4..7 and then 12..15, i.e. the upper
    four 16-bit elements of each 128-bit half, alternating __a and __b. */
2690 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2691}
2692
2693/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2694/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2695/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2696/// of \a __a and \a __b as input; other bits in these parameters are
2697/// ignored.
2698///
2699/// \code{.operation}
2700/// result[31:0] := __a[95:64]
2701/// result[63:32] := __b[95:64]
2702/// result[95:64] := __a[127:96]
2703/// result[127:96] := __b[127:96]
2704/// result[159:128] := __a[223:192]
2705/// result[191:160] := __b[223:192]
2706/// result[223:192] := __a[255:224]
2707/// result[255:224] := __b[255:224]
2708/// \endcode
2709///
2710/// \headerfile <immintrin.h>
2711///
2712/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2713///
2714/// \param __a
2715/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2716/// elements of the result.
2717/// \param __b
2718/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2719/// elements of the result.
2720/// \returns A 256-bit vector of [8 x i32] containing the result.
2721static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2722_mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
 /* Shuffle index k (0..7) picks element k of __a and 8+k picks element k of
    __b. Elements 2,3 come from the low 128-bit half and 6,7 from the high
    half, alternating __a and __b. */
2723 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2724}
2725
2726/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2727/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2728/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2729/// of \a __a and \a __b as input; other bits in these parameters are
2730/// ignored.
2731///
2732/// \code{.operation}
2733/// result[63:0] := __a[127:64]
2734/// result[127:64] := __b[127:64]
2735/// result[191:128] := __a[255:192]
2736/// result[255:192] := __b[255:192]
2737/// \endcode
2738///
2739/// \headerfile <immintrin.h>
2740///
2741/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2742///
2743/// \param __a
2744/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2745/// elements of the result.
2746/// \param __b
2747/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2748/// elements of the result.
2749/// \returns A 256-bit vector of [4 x i64] containing the result.
2750static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2751_mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
 /* Shuffle index k (0..3) picks element k of __a and 4+k picks element k of
    __b. Element 1 is the upper quadword of the low half and element 3 the
    upper quadword of the high half. */
2752 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2753}
2754
2755/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2756/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2757/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2758/// input; other bits in these parameters are ignored.
2759///
2760/// \code{.operation}
2761/// result[7:0] := __a[7:0]
2762/// result[15:8] := __b[7:0]
2763/// result[23:16] := __a[15:8]
2764/// result[31:24] := __b[15:8]
2765/// . . .
2766/// result[127:120] := __b[63:56]
2767/// result[135:128] := __a[135:128]
2768/// . . .
2769/// result[255:248] := __b[191:184]
2770/// \endcode
2771///
2772/// \headerfile <immintrin.h>
2773///
2774/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2775///
2776/// \param __a
2777/// A 256-bit integer vector used as the source for the even-numbered bytes
2778/// of the result.
2779/// \param __b
2780/// A 256-bit integer vector used as the source for the odd-numbered bytes
2781/// of the result.
2782/// \returns A 256-bit integer vector containing the result.
2783static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2784_mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
 /* Shuffle index k (0..31) picks byte k of __a and 32+k picks byte k of __b.
    The index pairs walk bytes 0..7 and then 16..23, i.e. the lower 8 bytes
    of each 128-bit half, alternating __a and __b. */
2785 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2786}
2787
2788/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2789/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2790/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2791/// 128-bit half of \a __a and \a __b as input; other bits in these
2792/// parameters are ignored.
2793///
2794/// \code{.operation}
2795/// result[15:0] := __a[15:0]
2796/// result[31:16] := __b[15:0]
2797/// result[47:32] := __a[31:16]
2798/// result[63:48] := __b[31:16]
2799/// . . .
2800/// result[127:112] := __b[63:48]
2801/// result[143:128] := __a[143:128]
2802/// . . .
2803/// result[255:240] := __b[191:176]
2804/// \endcode
2805///
2806/// \headerfile <immintrin.h>
2807///
2808/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2809///
2810/// \param __a
2811/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2812/// elements of the result.
2813/// \param __b
2814/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2815/// elements of the result.
2816/// \returns A 256-bit vector of [16 x i16] containing the result.
2817static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2818_mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
 /* Shuffle index k (0..15) picks element k of __a and 16+k picks element k of
    __b. The index pairs walk elements 0..3 and then 8..11, i.e. the lower
    four 16-bit elements of each 128-bit half, alternating __a and __b. */
2819 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2820}
2821
2822/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2823/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2824/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2825/// of \a __a and \a __b as input; other bits in these parameters are
2826/// ignored.
2827///
2828/// \code{.operation}
2829/// result[31:0] := __a[31:0]
2830/// result[63:32] := __b[31:0]
2831/// result[95:64] := __a[63:32]
2832/// result[127:96] := __b[63:32]
2833/// result[159:128] := __a[159:128]
2834/// result[191:160] := __b[159:128]
2835/// result[223:192] := __a[191:160]
2836/// result[255:224] := __b[191:160]
2837/// \endcode
2838///
2839/// \headerfile <immintrin.h>
2840///
2841/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2842///
2843/// \param __a
2844/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2845/// elements of the result.
2846/// \param __b
2847/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2848/// elements of the result.
2849/// \returns A 256-bit vector of [8 x i32] containing the result.
2850static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2851_mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
 /* Shuffle index k (0..7) picks element k of __a and 8+k picks element k of
    __b. Elements 0,1 come from the low 128-bit half and 4,5 from the high
    half, alternating __a and __b. */
2852 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2853}
2854
2855/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2856/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2857/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2858/// of \a __a and \a __b as input; other bits in these parameters are
2859/// ignored.
2860///
2861/// \code{.operation}
2862/// result[63:0] := __a[63:0]
2863/// result[127:64] := __b[63:0]
2864/// result[191:128] := __a[191:128]
2865/// result[255:192] := __b[191:128]
2866/// \endcode
2867///
2868/// \headerfile <immintrin.h>
2869///
2870/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2871///
2872/// \param __a
2873/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2874/// elements of the result.
2875/// \param __b
2876/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2877/// elements of the result.
2878/// \returns A 256-bit vector of [4 x i64] containing the result.
2879static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2880_mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
 /* Shuffle index k (0..3) picks element k of __a and 4+k picks element k of
    __b. Element 0 is the lower quadword of the low half and element 2 the
    lower quadword of the high half. */
2881 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2882}
2883
2884/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2885/// \a __b.
2886///
2887/// \headerfile <immintrin.h>
2888///
2889/// This intrinsic corresponds to the \c VPXOR instruction.
2890///
2891/// \param __a
2892/// A 256-bit integer vector.
2893/// \param __b
2894/// A 256-bit integer vector.
2895/// \returns A 256-bit integer vector containing the result.
2896static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2897_mm256_xor_si256(__m256i __a, __m256i __b)
2898{
 /* XOR acts on each bit independently, so the lane type used for the
    operation (__v4du here) has no influence on the 256-bit result. */
2899 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2900}
2901
2902/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2903/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2904/// boundary.
2905///
2906/// \headerfile <immintrin.h>
2907///
2908/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2909///
2910/// \param __V
2911/// A pointer to the 32-byte aligned memory containing the vector to load.
2912/// \returns A 256-bit integer vector loaded from memory.
2913static __inline__ __m256i __DEFAULT_FN_ATTRS256
2915{
2916 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2917 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2918}
2919
2920/// Broadcasts the 32-bit floating-point value from the low element of the
2921/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2922/// 128-bit vector of [4 x float].
2923///
2924/// \headerfile <immintrin.h>
2925///
2926/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2927///
2928/// \param __X
2929/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2930/// \returns A 128-bit vector of [4 x float] containing the result.
2931static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
2933 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
2934}
2935
2936/// Broadcasts the 64-bit floating-point value from the low element of the
2937/// 128-bit vector of [2 x double] in \a __a to both elements of the
2938/// result's 128-bit vector of [2 x double].
2939///
2940/// \headerfile <immintrin.h>
2941///
2942/// This intrinsic corresponds to the \c MOVDDUP instruction.
2943///
2944/// \param __a
2945/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2946/// \returns A 128-bit vector of [2 x double] containing the result.
2947static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
2949 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2950}
2951
2952/// Broadcasts the 32-bit floating-point value from the low element of the
2953/// 128-bit vector of [4 x float] in \a __X to all elements of the
2954/// result's 256-bit vector of [8 x float].
2955///
2956/// \headerfile <immintrin.h>
2957///
2958/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2959///
2960/// \param __X
2961/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2962/// \returns A 256-bit vector of [8 x float] containing the result.
2963static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
2965 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2966}
2967
2968/// Broadcasts the 64-bit floating-point value from the low element of the
2969/// 128-bit vector of [2 x double] in \a __X to all elements of the
2970/// result's 256-bit vector of [4 x double].
2971///
2972/// \headerfile <immintrin.h>
2973///
2974/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2975///
2976/// \param __X
2977/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2978/// \returns A 256-bit vector of [4 x double] containing the result.
2979static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
2981 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
2982}
2983
2984/// Broadcasts the 128-bit integer data from \a __X to both the lower and
2985/// upper halves of the 256-bit result.
2986///
2987/// \headerfile <immintrin.h>
2988///
2989/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
2990///
2991/// \param __X
2992/// A 128-bit integer vector to be broadcast.
2993/// \returns A 256-bit integer vector containing the result.
2994static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2996 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
2997}
2998
/* Alias under the _mm_ prefix: expands to a plain call of
   _mm256_broadcastsi128_si256 with the same argument. */
2999#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3000
3001/// Merges 32-bit integer elements from either of the two 128-bit vectors of
3002/// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3003/// as specified by the immediate integer operand \a M.
3004///
3005/// \code{.operation}
3006/// FOR i := 0 TO 3
3007/// j := i*32
3008/// IF M[i] == 0
3009/// result[31+j:j] := V1[31+j:j]
3010/// ELSE
3011/// result[31+j:j] := V2[31+j:j]
3012/// FI
3013/// ENDFOR
3014/// \endcode
3015///
3016/// \headerfile <immintrin.h>
3017///
3018/// \code
3019/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3020/// \endcode
3021///
3022/// This intrinsic corresponds to the \c VPBLENDD instruction.
3023///
3024/// \param V1
3025/// A 128-bit vector of [4 x i32] containing source values.
3026/// \param V2
3027/// A 128-bit vector of [4 x i32] containing source values.
3028/// \param M
3029/// An immediate 8-bit integer operand, with bits [3:0] specifying the
3030/// source for each element of the result. The position of the mask bit
3031/// corresponds to the index of a copied value. When a mask bit is 0, the
3032/// element is copied from \a V1; otherwise, it is copied from \a V2.
3033/// \returns A 128-bit vector of [4 x i32] containing the result.
/* Defined as a macro, presumably so that M reaches the builtin as an integer
   constant expression. V1 and V2 are each parenthesized and cast through
   __m128i before being viewed as __v4si; M is cast to int. */
3034#define _mm_blend_epi32(V1, V2, M) \
3035 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
3036 (__v4si)(__m128i)(V2), (int)(M)))
3037
3038/// Merges 32-bit integer elements from either of the two 256-bit vectors of
3039/// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3040/// as specified by the immediate integer operand \a M.
3041///
3042/// \code{.operation}
3043/// FOR i := 0 TO 7
3044/// j := i*32
3045/// IF M[i] == 0
3046/// result[31+j:j] := V1[31+j:j]
3047/// ELSE
3048/// result[31+j:j] := V2[31+j:j]
3049/// FI
3050/// ENDFOR
3051/// \endcode
3052///
3053/// \headerfile <immintrin.h>
3054///
3055/// \code
3056/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3057/// \endcode
3058///
3059/// This intrinsic corresponds to the \c VPBLENDD instruction.
3060///
3061/// \param V1
3062/// A 256-bit vector of [8 x i32] containing source values.
3063/// \param V2
3064/// A 256-bit vector of [8 x i32] containing source values.
3065/// \param M
3066/// An immediate 8-bit integer operand, with bits [7:0] specifying the
3067/// source for each element of the result. The position of the mask bit
3068/// corresponds to the index of a copied value. When a mask bit is 0, the
3069/// element is copied from \a V1; otherwise, it is copied from \a V2.
3070/// \returns A 256-bit vector of [8 x i32] containing the result.
/* Defined as a macro, presumably so that M reaches the builtin as an integer
   constant expression. V1 and V2 are each parenthesized and cast through
   __m256i before being viewed as __v8si; M is cast to int. */
3071#define _mm256_blend_epi32(V1, V2, M) \
3072 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
3073 (__v8si)(__m256i)(V2), (int)(M)))
3074
3075/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3076/// bytes of the 256-bit result.
3077///
3078/// \headerfile <immintrin.h>
3079///
3080/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3081///
3082/// \param __X
3083/// A 128-bit integer vector whose low byte will be broadcast.
3084/// \returns A 256-bit integer vector containing the result.
3085static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3087 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3088}
3089
3090/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3091/// to all elements of the result's 256-bit vector of [16 x i16].
3092///
3093/// \headerfile <immintrin.h>
3094///
3095/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3096///
3097/// \param __X
3098/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3099/// \returns A 256-bit vector of [16 x i16] containing the result.
3100static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3102 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3103}
3104
3105/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3106/// to all elements of the result's 256-bit vector of [8 x i32].
3107///
3108/// \headerfile <immintrin.h>
3109///
3110/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3111///
3112/// \param __X
3113/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3114/// \returns A 256-bit vector of [8 x i32] containing the result.
3115static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3117 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3118}
3119
3120/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3121/// to all elements of the result's 256-bit vector of [4 x i64].
3122///
3123/// \headerfile <immintrin.h>
3124///
3125/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3126///
3127/// \param __X
3128/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3129/// \returns A 256-bit vector of [4 x i64] containing the result.
3130static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3132 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3133}
3134
3135/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3136/// bytes of the 128-bit result.
3137///
3138/// \headerfile <immintrin.h>
3139///
3140/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3141///
3142/// \param __X
3143/// A 128-bit integer vector whose low byte will be broadcast.
3144/// \returns A 128-bit integer vector containing the result.
3145static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3147 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3148}
3149
3150/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3151/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3152///
3153/// \headerfile <immintrin.h>
3154///
3155/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3156///
3157/// \param __X
3158/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3159/// \returns A 128-bit vector of [8 x i16] containing the result.
3160static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3162 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3163}
3164
3165/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3166/// to all elements of the result's 128-bit vector of [4 x i32].
3167///
3168/// \headerfile <immintrin.h>
3169///
3170/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3171///
3172/// \param __X
3173/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3174/// \returns A 128-bit vector of [4 x i32] containing the result.
3175static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3177 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3178}
3179
3180/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3181/// to both elements of the result's 128-bit vector of [2 x i64].
3182///
3183/// \headerfile <immintrin.h>
3184///
3185/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3186///
3187/// \param __X
3188/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3189/// \returns A 128-bit vector of [2 x i64] containing the result.
3190static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3192 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3193}
3194
3195/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3196/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3197/// elements of the 256-bit vector of [8 x i32] in \a __b.
3198///
3199/// \code{.operation}
3200/// FOR i := 0 TO 7
3201/// j := i*32
3202/// k := __b[j+2:j] * 32
3203/// result[j+31:j] := __a[k+31:k]
3204/// ENDFOR
3205/// \endcode
3206///
3207/// \headerfile <immintrin.h>
3208///
3209/// This intrinsic corresponds to the \c VPERMD instruction.
3210///
3211/// \param __a
3212/// A 256-bit vector of [8 x i32] containing the source values.
3213/// \param __b
3214/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3215/// \a __a.
3216/// \returns A 256-bit vector of [8 x i32] containing the result.
3217static __inline__ __m256i __DEFAULT_FN_ATTRS256
3219{
3220 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3221}
3222
/// Builds a 256-bit vector of [4 x double] in which each element is a copy
///    of one of the four elements of the 256-bit vector of [4 x double] in
///    \a V. The immediate value \a M supplies a 2-bit source index for each
///    element of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3252
3253/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3254/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3255/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3256///
3257/// \code{.operation}
3258/// FOR i := 0 TO 7
3259/// j := i*32
3260/// k := __b[j+2:j] * 32
3261/// result[j+31:j] := __a[k+31:k]
3262/// ENDFOR
3263/// \endcode
3264///
3265/// \headerfile <immintrin.h>
3266///
3267/// This intrinsic corresponds to the \c VPERMPS instruction.
3268///
3269/// \param __a
3270/// A 256-bit vector of [8 x float] containing the source values.
3271/// \param __b
3272/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3273/// \a __a.
3274/// \returns A 256-bit vector of [8 x float] containing the result.
3275static __inline__ __m256 __DEFAULT_FN_ATTRS256
3277{
3278 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3279}
3280
/// Builds a 256-bit vector of [4 x i64] in which each element is a copy of
///    one of the four elements of the 256-bit vector of [4 x i64] in \a V.
///    The immediate value \a M supplies a 2-bit source index for each
///    element of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMQ instruction.
///
/// \param V
///    A 256-bit vector of [4 x i64] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3310
/// Builds a 256-bit integer vector whose two 128-bit halves are chosen
///    independently by the immediate value \a M. Each half is either cleared
///    to zero or copied from one of the four 128-bit halves of the 256-bit
///    vectors \a V1 and \a V2.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value describing how the result is formed. Bits [3:0]
///    control the lower half of the result and bits [7:4] control the upper
///    half. Within each 4-bit control value, bit 3 set to 1 makes that half
///    zero; otherwise bits [1:0] pick the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), \
                                     (__m256i)(V2), \
                                     (int)(M)))
3356
/// Extracts one 128-bit half of the 256-bit integer vector \a V. If bit 0
///    of the immediate \a M is zero, the lower half of \a V is extracted;
///    otherwise, the upper half of \a V is extracted.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3376
/// Produces a copy of the 256-bit vector \a V1 in which one 128-bit half has
///    been replaced by the 128-bit vector \a V2. The lower half is replaced
///    when bit 0 of the immediate \a M is zero; otherwise the upper half is
///    replaced.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value selecting the half of the result that receives
///    \a V2.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), \
      (__v2di)(__m128i)(V2), \
      (int)(M)))
3400
3401/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3402/// the most significant bit of the corresponding element in the mask
3403/// \a __M is set; otherwise, sets that element of the result to zero.
3404/// Returns the 256-bit [8 x i32] result.
3405///
3406/// \code{.operation}
3407/// FOR i := 0 TO 7
3408/// j := i*32
3409/// IF __M[j+31] == 1
3410/// result[j+31:j] := Load32(__X+(i*4))
3411/// ELSE
3412/// result[j+31:j] := 0
3413/// FI
3414/// ENDFOR
3415/// \endcode
3416///
3417/// \headerfile <immintrin.h>
3418///
3419/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3420///
3421/// \param __X
3422/// A pointer to the memory used for loading values.
3423/// \param __M
3424/// A 256-bit vector of [8 x i32] containing the mask bits.
3425/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3426/// elements.
3427static __inline__ __m256i __DEFAULT_FN_ATTRS256
3428_mm256_maskload_epi32(int const *__X, __m256i __M)
3429{
3430 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3431}
3432
3433/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3434/// the most significant bit of the corresponding element in the mask
3435/// \a __M is set; otherwise, sets that element of the result to zero.
3436/// Returns the 256-bit [4 x i64] result.
3437///
3438/// \code{.operation}
3439/// FOR i := 0 TO 3
3440/// j := i*64
3441/// IF __M[j+63] == 1
3442/// result[j+63:j] := Load64(__X+(i*8))
3443/// ELSE
3444/// result[j+63:j] := 0
3445/// FI
3446/// ENDFOR
3447/// \endcode
3448///
3449/// \headerfile <immintrin.h>
3450///
3451/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3452///
3453/// \param __X
3454/// A pointer to the memory used for loading values.
3455/// \param __M
3456/// A 256-bit vector of [4 x i64] containing the mask bits.
3457/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3458/// elements.
3459static __inline__ __m256i __DEFAULT_FN_ATTRS256
3460_mm256_maskload_epi64(long long const *__X, __m256i __M)
3461{
3462 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3463}
3464
3465/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3466/// the most significant bit of the corresponding element in the mask
3467/// \a __M is set; otherwise, sets that element of the result to zero.
3468/// Returns the 128-bit [4 x i32] result.
3469///
3470/// \code{.operation}
3471/// FOR i := 0 TO 3
3472/// j := i*32
3473/// IF __M[j+31] == 1
3474/// result[j+31:j] := Load32(__X+(i*4))
3475/// ELSE
3476/// result[j+31:j] := 0
3477/// FI
3478/// ENDFOR
3479/// \endcode
3480///
3481/// \headerfile <immintrin.h>
3482///
3483/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3484///
3485/// \param __X
3486/// A pointer to the memory used for loading values.
3487/// \param __M
3488/// A 128-bit vector of [4 x i32] containing the mask bits.
3489/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3490/// elements.
3491static __inline__ __m128i __DEFAULT_FN_ATTRS128
3492_mm_maskload_epi32(int const *__X, __m128i __M)
3493{
3494 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3495}
3496
3497/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3498/// the most significant bit of the corresponding element in the mask
3499/// \a __M is set; otherwise, sets that element of the result to zero.
3500/// Returns the 128-bit [2 x i64] result.
3501///
3502/// \code{.operation}
3503/// FOR i := 0 TO 1
3504/// j := i*64
3505/// IF __M[j+63] == 1
3506/// result[j+63:j] := Load64(__X+(i*8))
3507/// ELSE
3508/// result[j+63:j] := 0
3509/// FI
3510/// ENDFOR
3511/// \endcode
3512///
3513/// \headerfile <immintrin.h>
3514///
3515/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3516///
3517/// \param __X
3518/// A pointer to the memory used for loading values.
3519/// \param __M
3520/// A 128-bit vector of [2 x i64] containing the mask bits.
3521/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3522/// elements.
3523static __inline__ __m128i __DEFAULT_FN_ATTRS128
3524_mm_maskload_epi64(long long const *__X, __m128i __M)
3525{
3526 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3527}
3528
3529/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3530/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3531/// the corresponding element in the mask \a __M is set; otherwise, the
3532/// memory element is unchanged.
3533///
3534/// \code{.operation}
3535/// FOR i := 0 TO 7
3536/// j := i*32
3537/// IF __M[j+31] == 1
3538/// Store32(__X+(i*4), __Y[j+31:j])
3539/// FI
3540/// ENDFOR
3541/// \endcode
3542///
3543/// \headerfile <immintrin.h>
3544///
3545/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3546///
3547/// \param __X
3548/// A pointer to the memory used for storing values.
3549/// \param __M
3550/// A 256-bit vector of [8 x i32] containing the mask bits.
3551/// \param __Y
3552/// A 256-bit vector of [8 x i32] containing the values to store.
3553static __inline__ void __DEFAULT_FN_ATTRS256
3554_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3555{
3556 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3557}
3558
3559/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3560/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3561/// the corresponding element in the mask \a __M is set; otherwise, the
3562/// memory element is unchanged.
3563///
3564/// \code{.operation}
3565/// FOR i := 0 TO 3
3566/// j := i*64
3567/// IF __M[j+63] == 1
3568/// Store64(__X+(i*8), __Y[j+63:j])
3569/// FI
3570/// ENDFOR
3571/// \endcode
3572///
3573/// \headerfile <immintrin.h>
3574///
3575/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3576///
3577/// \param __X
3578/// A pointer to the memory used for storing values.
3579/// \param __M
3580/// A 256-bit vector of [4 x i64] containing the mask bits.
3581/// \param __Y
3582/// A 256-bit vector of [4 x i64] containing the values to store.
3583static __inline__ void __DEFAULT_FN_ATTRS256
3584_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3585{
3586 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3587}
3588
3589/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3590/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3591/// the corresponding element in the mask \a __M is set; otherwise, the
3592/// memory element is unchanged.
3593///
3594/// \code{.operation}
3595/// FOR i := 0 TO 3
3596/// j := i*32
3597/// IF __M[j+31] == 1
3598/// Store32(__X+(i*4), __Y[j+31:j])
3599/// FI
3600/// ENDFOR
3601/// \endcode
3602///
3603/// \headerfile <immintrin.h>
3604///
3605/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3606///
3607/// \param __X
3608/// A pointer to the memory used for storing values.
3609/// \param __M
3610/// A 128-bit vector of [4 x i32] containing the mask bits.
3611/// \param __Y
3612/// A 128-bit vector of [4 x i32] containing the values to store.
3613static __inline__ void __DEFAULT_FN_ATTRS128
3614_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3615{
3616 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3617}
3618
3619/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3620/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3621/// the corresponding element in the mask \a __M is set; otherwise, the
3622/// memory element is unchanged.
3623///
3624/// \code{.operation}
3625/// FOR i := 0 TO 1
3626/// j := i*64
3627/// IF __M[j+63] == 1
3628/// Store64(__X+(i*8), __Y[j+63:j])
3629/// FI
3630/// ENDFOR
3631/// \endcode
3632///
3633/// \headerfile <immintrin.h>
3634///
3635/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3636///
3637/// \param __X
3638/// A pointer to the memory used for storing values.
3639/// \param __M
3640/// A 128-bit vector of [2 x i64] containing the mask bits.
3641/// \param __Y
3642/// A 128-bit vector of [2 x i64] containing the values to store.
3643static __inline__ void __DEFAULT_FN_ATTRS128
3644_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3645{
3646 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3647}
3648
3649/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3650/// left by the number of bits given in the corresponding element of the
3651/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3652/// returns the result. If the shift count for any element is greater than
3653/// 31, the result for that element is zero.
3654///
3655/// \headerfile <immintrin.h>
3656///
3657/// This intrinsic corresponds to the \c VPSLLVD instruction.
3658///
3659/// \param __X
3660/// A 256-bit vector of [8 x i32] to be shifted.
3661/// \param __Y
3662/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3663/// bits).
3664/// \returns A 256-bit vector of [8 x i32] containing the result.
3665static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3666_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3667{
3668 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3669}
3670
3671/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3672/// left by the number of bits given in the corresponding element of the
3673/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3674/// returns the result. If the shift count for any element is greater than
3675/// 31, the result for that element is zero.
3676///
3677/// \headerfile <immintrin.h>
3678///
3679/// This intrinsic corresponds to the \c VPSLLVD instruction.
3680///
3681/// \param __X
3682/// A 128-bit vector of [4 x i32] to be shifted.
3683/// \param __Y
3684/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3685/// bits).
3686/// \returns A 128-bit vector of [4 x i32] containing the result.
3687static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3688_mm_sllv_epi32(__m128i __X, __m128i __Y)
3689{
3690 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3691}
3692
3693/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3694/// left by the number of bits given in the corresponding element of the
3695/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3696/// returns the result. If the shift count for any element is greater than
3697/// 63, the result for that element is zero.
3698///
3699/// \headerfile <immintrin.h>
3700///
3701/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3702///
3703/// \param __X
3704/// A 256-bit vector of [4 x i64] to be shifted.
3705/// \param __Y
3706/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3707/// bits).
3708/// \returns A 256-bit vector of [4 x i64] containing the result.
3709static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3710_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3711{
3712 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3713}
3714
3715/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3716/// left by the number of bits given in the corresponding element of the
3717/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3718/// returns the result. If the shift count for any element is greater than
3719/// 63, the result for that element is zero.
3720///
3721/// \headerfile <immintrin.h>
3722///
3723/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3724///
3725/// \param __X
3726/// A 128-bit vector of [2 x i64] to be shifted.
3727/// \param __Y
3728/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3729/// bits).
3730/// \returns A 128-bit vector of [2 x i64] containing the result.
3731static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3732_mm_sllv_epi64(__m128i __X, __m128i __Y)
3733{
3734 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3735}
3736
3737/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3738/// right by the number of bits given in the corresponding element of the
3739/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3740/// returns the result. If the shift count for any element is greater than
3741/// 31, the result for that element is 0 or -1 according to the sign bit
3742/// for that element.
3743///
3744/// \headerfile <immintrin.h>
3745///
3746/// This intrinsic corresponds to the \c VPSRAVD instruction.
3747///
3748/// \param __X
3749/// A 256-bit vector of [8 x i32] to be shifted.
3750/// \param __Y
3751/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3752/// bits).
3753/// \returns A 256-bit vector of [8 x i32] containing the result.
3754static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3755_mm256_srav_epi32(__m256i __X, __m256i __Y)
3756{
3757 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3758}
3759
3760/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3761/// right by the number of bits given in the corresponding element of the
3762/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3763/// returns the result. If the shift count for any element is greater than
3764/// 31, the result for that element is 0 or -1 according to the sign bit
3765/// for that element.
3766///
3767/// \headerfile <immintrin.h>
3768///
3769/// This intrinsic corresponds to the \c VPSRAVD instruction.
3770///
3771/// \param __X
3772/// A 128-bit vector of [4 x i32] to be shifted.
3773/// \param __Y
3774/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3775/// bits).
3776/// \returns A 128-bit vector of [4 x i32] containing the result.
3777static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3778_mm_srav_epi32(__m128i __X, __m128i __Y)
3779{
3780 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3781}
3782
3783/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3784/// right by the number of bits given in the corresponding element of the
3785/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3786/// returns the result. If the shift count for any element is greater than
3787/// 31, the result for that element is zero.
3788///
3789/// \headerfile <immintrin.h>
3790///
3791/// This intrinsic corresponds to the \c VPSRLVD instruction.
3792///
3793/// \param __X
3794/// A 256-bit vector of [8 x i32] to be shifted.
3795/// \param __Y
3796/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3797/// bits).
3798/// \returns A 256-bit vector of [8 x i32] containing the result.
3799static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3800_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3801{
3802 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3803}
3804
3805/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3806/// right by the number of bits given in the corresponding element of the
3807/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3808/// returns the result. If the shift count for any element is greater than
3809/// 31, the result for that element is zero.
3810///
3811/// \headerfile <immintrin.h>
3812///
3813/// This intrinsic corresponds to the \c VPSRLVD instruction.
3814///
3815/// \param __X
3816/// A 128-bit vector of [4 x i32] to be shifted.
3817/// \param __Y
3818/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3819/// bits).
3820/// \returns A 128-bit vector of [4 x i32] containing the result.
3821static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3822_mm_srlv_epi32(__m128i __X, __m128i __Y)
3823{
3824 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3825}
3826
3827/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3828/// right by the number of bits given in the corresponding element of the
3829/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3830/// returns the result. If the shift count for any element is greater than
3831/// 63, the result for that element is zero.
3832///
3833/// \headerfile <immintrin.h>
3834///
3835/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3836///
3837/// \param __X
3838/// A 256-bit vector of [4 x i64] to be shifted.
3839/// \param __Y
3840/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3841/// bits).
3842/// \returns A 256-bit vector of [4 x i64] containing the result.
3843static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3844_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3845{
3846 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3847}
3848
3849/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3850/// right by the number of bits given in the corresponding element of the
3851/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3852/// returns the result. If the shift count for any element is greater than
3853/// 63, the result for that element is zero.
3854///
3855/// \headerfile <immintrin.h>
3856///
3857/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3858///
3859/// \param __X
3860/// A 128-bit vector of [2 x i64] to be shifted.
3861/// \param __Y
3862/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3863/// bits).
3864/// \returns A 128-bit vector of [2 x i64] containing the result.
3865static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3866_mm_srlv_epi64(__m128i __X, __m128i __Y)
3867{
3868 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3869}
3870
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
3919
/// Performs a masked gather of four 64-bit floating-point values. Each
///    element of the result comes either from the 256-bit vector of
///    [4 x double] in \a a or from memory \a m, addressed with a scaled
///    index taken from the 128-bit vector of [4 x i32] in \a i. The 256-bit
///    vector of [4 x double] in \a mask selects the source per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the value of every element
///    whose mask bit is zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The mask bit of
///    an element is its most significant bit. When a mask bit is zero the
///    corresponding value of \a a is used; otherwise the value is loaded
///    from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
3967
/// Performs a masked gather of two 64-bit floating-point values. Each
///    element of the result comes either from the 128-bit vector of
///    [2 x double] in \a a or from memory \a m, addressed with a scaled
///    index taken from the 128-bit vector of [2 x i64] in \a i. The 128-bit
///    vector of [2 x double] in \a mask selects the source per element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] supplying the value of every element
///    whose mask bit is zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The mask bit of
///    an element is its most significant bit. When a mask bit is zero the
///    corresponding value of \a a is used; otherwise the value is loaded
///    from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), \
      (double const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), \
      (s)))
4015
/// Performs a masked gather of four 64-bit floating-point values. Each
///    element of the result comes either from the 256-bit vector of
///    [4 x double] in \a a or from memory \a m, addressed with a scaled
///    index taken from the 256-bit vector of [4 x i64] in \a i. The 256-bit
///    vector of [4 x double] in \a mask selects the source per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the value of every element
///    whose mask bit is zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The mask bit of
///    an element is its most significant bit. When a mask bit is zero the
///    corresponding value of \a a is used; otherwise the value is loaded
///    from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
4063
/// Performs a masked gather of four 32-bit floating-point values. Each
///    element of the result comes either from the 128-bit vector of
///    [4 x float] in \a a or from memory \a m, addressed with a scaled index
///    taken from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask selects the source per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the value of every element
///    whose mask bit is zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The mask bit of
///    an element is its most significant bit. When a mask bit is zero the
///    corresponding value of \a a is used; otherwise the value is loaded
///    from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a), \
      (float const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4sf)(__m128)(mask), \
      (s)))
4111
/// Conditionally gathers eight 32-bit floating-point values, either from the
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)(__m256)(mask), (s)))
4159
/// Conditionally gathers two 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory. Only the first
///    two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))
4210
/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)(__m128)(mask), (s)))
4258
/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
4306
/// Conditionally gathers eight 32-bit integer values, either from the
///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                        (int const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8si)(__m256i)(mask), (s)))
4354
/// Conditionally gathers two 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory. Only the first two elements
///    are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
4405
/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                        (int const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4si)(__m128i)(mask), (s)))
4453
/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))
4502
/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))
4550
/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))
4598
/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))
4646
/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The pass-through operand is left undefined and the mask compares zero with
   itself, which sets every mask bit, so all elements come from memory. */
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))
4682
/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* The pass-through operand is left undefined and the mask compares zero with
   itself, which sets every mask bit, so all elements come from memory. */
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))
4718
/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The pass-through operand is left undefined and the mask compares zero with
   itself, which sets every mask bit, so all elements come from memory. */
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))
4753
/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* The pass-through operand is left undefined and the mask compares zero with
   itself, which sets every mask bit, so all elements come from memory. */
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))
4789
/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The pass-through operand is left undefined and the mask compares zero with
   itself, which sets every mask bit, so all elements come from memory. */
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
                                     (s)))
4824
/// Gathers eight 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* The pass-through operand is left undefined and the mask compares zero with
   itself, which sets every mask bit, so all elements come from memory. */
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
                                                              _mm256_setzero_ps(), \
                                                              _CMP_EQ_OQ), \
                                        (s)))
4860
/// Gathers two 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
///    elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The pass-through operand is left undefined and the mask compares zero with
   itself, which sets every mask bit, so all gathered elements come from
   memory. */
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
                                     (s)))
4897
/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The pass-through operand is left undefined and the mask compares zero with
   itself, which sets every mask bit, so all elements come from memory. */
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                             _mm_setzero_ps()), \
                                        (s)))
4932
4933/// Gathers four 32-bit integer values from memory \a m using scaled
4934/// indexes from the 128-bit vector of [4 x i32] in \a i.
4935///
4936/// \code{.operation}
4937/// FOR element := 0 to 3
4938/// j := element*32
4939/// k := element*32
4940/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4941/// ENDFOR
4942/// \endcode
4943///
4944/// \headerfile <immintrin.h>
4945///
4946/// \code
4947/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
4948/// \endcode
4949///
4950/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4951///
4952/// \param m
4953/// A pointer to the memory used for loading values.
4954/// \param i
4955/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4956/// \param s
4957/// A literal constant scale factor for the indexes in \a i. Must be
4958/// 1, 2, 4, or 8.
4959/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
// NOTE(review): the undefined first argument and the all-ones fourth argument
// (every element set to -1) are presumably the builtin's pass-through source
// and element mask, so every element is gathered -- confirm.
4960#define _mm_i32gather_epi32(m, i, s) \
4961 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
4962 (int const *)(m), (__v4si)(__m128i)(i), \
4963 (__v4si)_mm_set1_epi32(-1), (s)))
4964
4965/// Gathers eight 32-bit integer values from memory \a m using scaled
4966/// indexes from the 256-bit vector of [8 x i32] in \a i.
4967///
4968/// \code{.operation}
4969/// FOR element := 0 to 7
4970/// j := element*32
4971/// k := element*32
4972/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4973/// ENDFOR
4974/// \endcode
4975///
4976/// \headerfile <immintrin.h>
4977///
4978/// \code
4979/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
4980/// \endcode
4981///
4982/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4983///
4984/// \param m
4985/// A pointer to the memory used for loading values.
4986/// \param i
4987/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4988/// \param s
4989/// A literal constant scale factor for the indexes in \a i. Must be
4990/// 1, 2, 4, or 8.
4991/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
// NOTE(review): the undefined first argument and the all-ones fourth argument
// (every element set to -1) are presumably the builtin's pass-through source
// and element mask, so every element is gathered -- confirm.
4992#define _mm256_i32gather_epi32(m, i, s) \
4993 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
4994 (int const *)(m), (__v8si)(__m256i)(i), \
4995 (__v8si)_mm256_set1_epi32(-1), (s)))
4996
4997/// Gathers two 32-bit integer values from memory \a m using scaled indexes
4998/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
4999/// of the result are zeroed.
5000///
5001/// \code{.operation}
5002/// FOR element := 0 to 1
5003/// j := element*32
5004/// k := element*64
5005/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5006/// ENDFOR
5007/// result[127:64] := 0
5008/// \endcode
5009///
5010/// \headerfile <immintrin.h>
5011///
5012/// \code
5013/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5014/// \endcode
5015///
5016/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5017///
5018/// \param m
5019/// A pointer to the memory used for loading values.
5020/// \param i
5021/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5022/// \param s
5023/// A literal constant scale factor for the indexes in \a i. Must be
5024/// 1, 2, 4, or 8.
5025/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/// The gathered values occupy the lower two elements; the upper two
/// elements are zero.
// NOTE(review): the undefined first argument and the all-ones fourth argument
// (every element set to -1) are presumably the builtin's pass-through source
// and element mask, so every element is gathered -- confirm.
5026#define _mm_i64gather_epi32(m, i, s) \
5027 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5028 (int const *)(m), (__v2di)(__m128i)(i), \
5029 (__v4si)_mm_set1_epi32(-1), (s)))
5030
5031/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5032/// from the 256-bit vector of [4 x i64] in \a i.
5033///
5034/// \code{.operation}
5035/// FOR element := 0 to 3
5036/// j := element*32
5037/// k := element*64
5038/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5039/// ENDFOR
5040/// \endcode
5041///
5042/// \headerfile <immintrin.h>
5043///
5044/// \code
5045/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5046/// \endcode
5047///
5048/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5049///
5050/// \param m
5051/// A pointer to the memory used for loading values.
5052/// \param i
5053/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5054/// \param s
5055/// A literal constant scale factor for the indexes in \a i. Must be
5056/// 1, 2, 4, or 8.
5057/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
// NOTE(review): the undefined first argument and the all-ones fourth argument
// (every element set to -1) are presumably the builtin's pass-through source
// and element mask, so every element is gathered -- confirm.
5058#define _mm256_i64gather_epi32(m, i, s) \
5059 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5060 (int const *)(m), (__v4di)(__m256i)(i), \
5061 (__v4si)_mm_set1_epi32(-1), (s)))
5062
5063/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5064/// from the first two elements of the 128-bit vector of [4 x i32] in \a i.
5065///
5066/// \code{.operation}
5067/// FOR element := 0 to 1
5068/// j := element*64
5069/// k := element*32
5070/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5071/// ENDFOR
5072/// \endcode
5073///
5074/// \headerfile <immintrin.h>
5075///
5076/// \code
5077/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5078/// \endcode
5079///
5080/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5081///
5082/// \param m
5083/// A pointer to the memory used for loading values.
5084/// \param i
5085/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5086/// the first two elements are used.
5087/// \param s
5088/// A literal constant scale factor for the indexes in \a i. Must be
5089/// 1, 2, 4, or 8.
5090/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
// NOTE(review): the undefined first argument and the all-ones fourth argument
// (every element set to -1) are presumably the builtin's pass-through source
// and element mask, so every element is gathered -- confirm.
5091#define _mm_i32gather_epi64(m, i, s) \
5092 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5093 (long long const *)(m), \
5094 (__v4si)(__m128i)(i), \
5095 (__v2di)_mm_set1_epi64x(-1), (s)))
5096
5097/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5098/// from the 128-bit vector of [4 x i32] in \a i.
5099///
5100/// \code{.operation}
5101/// FOR element := 0 to 3
5102/// j := element*64
5103/// k := element*32
5104/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5105/// ENDFOR
5106/// \endcode
5107///
5108/// \headerfile <immintrin.h>
5109///
5110/// \code
5111/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5112/// \endcode
5113///
5114/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5115///
5116/// \param m
5117/// A pointer to the memory used for loading values.
5118/// \param i
5119/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5120/// \param s
5121/// A literal constant scale factor for the indexes in \a i. Must be
5122/// 1, 2, 4, or 8.
5123/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
// NOTE(review): the undefined first argument and the all-ones fourth argument
// (every element set to -1) are presumably the builtin's pass-through source
// and element mask, so every element is gathered -- confirm.
5124#define _mm256_i32gather_epi64(m, i, s) \
5125 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5126 (long long const *)(m), \
5127 (__v4si)(__m128i)(i), \
5128 (__v4di)_mm256_set1_epi64x(-1), (s)))
5129
5130/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5131/// from the 128-bit vector of [2 x i64] in \a i.
5132///
5133/// \code{.operation}
5134/// FOR element := 0 to 1
5135/// j := element*64
5136/// k := element*64
5137/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5138/// ENDFOR
5139/// \endcode
5140///
5141/// \headerfile <immintrin.h>
5142///
5143/// \code
5144/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5145/// \endcode
5146///
5147/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5148///
5149/// \param m
5150/// A pointer to the memory used for loading values.
5151/// \param i
5152/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5153/// \param s
5154/// A literal constant scale factor for the indexes in \a i. Must be
5155/// 1, 2, 4, or 8.
5156/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
// NOTE(review): the undefined first argument and the all-ones fourth argument
// (every element set to -1) are presumably the builtin's pass-through source
// and element mask, so every element is gathered -- confirm.
5157#define _mm_i64gather_epi64(m, i, s) \
5158 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5159 (long long const *)(m), \
5160 (__v2di)(__m128i)(i), \
5161 (__v2di)_mm_set1_epi64x(-1), (s)))
5162
5163/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5164/// from the 256-bit vector of [4 x i64] in \a i.
5165///
5166/// \code{.operation}
5167/// FOR element := 0 to 3
5168/// j := element*64
5169/// k := element*64
5170/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5171/// ENDFOR
5172/// \endcode
5173///
5174/// \headerfile <immintrin.h>
5175///
5176/// \code
5177/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5178/// \endcode
5179///
5180/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5181///
5182/// \param m
5183/// A pointer to the memory used for loading values.
5184/// \param i
5185/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5186/// \param s
5187/// A literal constant scale factor for the indexes in \a i. Must be
5188/// 1, 2, 4, or 8.
5189/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
// NOTE(review): the undefined first argument and the all-ones fourth argument
// (every element set to -1) are presumably the builtin's pass-through source
// and element mask, so every element is gathered -- confirm.
5190#define _mm256_i64gather_epi64(m, i, s) \
5191 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5192 (long long const *)(m), \
5193 (__v4di)(__m256i)(i), \
5194 (__v4di)_mm256_set1_epi64x(-1), (s)))
5195
/* Remove the attribute helper macros defined at the top of this file so they
 * are no longer defined once this header ends. */
5196#undef __DEFAULT_FN_ATTRS256
5197#undef __DEFAULT_FN_ATTRS128
5198#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
5199#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5200
5201#endif /* __AVX2INTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
Definition altivec.h:578
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi32_epi64(__m128i __V)
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ int __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_movemask_epi8(__m256i __a)
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vecto...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(const void *__V)
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsubs_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than an...
Definition avx2intrin.h:722
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epu32(__m256i __a, __m256i __b)
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srav_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_andnot_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit int...
Definition avx2intrin.h:466
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation...
Definition avx2intrin.h:386
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maddubs_epi16(__m256i __a, __m256i __b)
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed by...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M)
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to mem...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epu16(__m256i __a, __m256i __b)
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upp...
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastsd_pd(__m128d __a)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M)
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi16(__m128i __V)
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsi...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packus_epi32(__m256i __V1, __m256i __V2)
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation,...
Definition avx2intrin.h:261
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi32(__m256i __a)
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a a...
Definition avx2intrin.h:139
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srav_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi64(__m128i __V)
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and r...
Definition avx2intrin.h:670
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed sa...
Definition avx2intrin.h:368
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits spec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi32(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memor...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu32_epi64(__m128i __V)
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi32(__m128i __V)
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x f...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_shuffle_epi8(__m256i __a, __m256i __b)
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the ...
Definition avx2intrin.h:551
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and r...
Definition avx2intrin.h:696
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M)
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and r...
Definition avx2intrin.h:938
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greate...
Definition avx2intrin.h:750
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi32(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and re...
Definition avx2intrin.h:969
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 b...
Definition avx2intrin.h:279
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi32(__m128i __V)
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result'...
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi32(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and ret...
Definition avx2intrin.h:869
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater...
Definition avx2intrin.h:776
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi32(__m256i __a, __m256i __b)
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned satur...
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using sign...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns t...
Definition avx2intrin.h:618
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi16(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in _...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi8(__m256i __a)
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each va...
Definition avx2intrin.h:107
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi64(__m128i __V)
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi64(__m128i __V)
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadds_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using ...
Definition avx2intrin.h:903
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi64(__m256i __a, __m256i __b)
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the ...
Definition avx2intrin.h:333
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi64(__m128i __V)
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_and_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
Definition avx2intrin.h:448
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsi128_si256(__m128i __X)
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi32(__m256i __a, __m256i __b)
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the ...
Definition avx2intrin.h:315
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packus_epi16(__m256i __a, __m256i __b)
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation,...
Definition avx2intrin.h:230
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m256i __a)
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a ...
Definition avx2intrin.h:123
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the...
Definition avx2intrin.h:297
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_or_si256(__m256i __a, __m256i __b)
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsd_pd(__m128d __X)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __X to all elements of the result's 256-bit vector of [4 x double].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi32(__m128i __V)
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and re...
Definition avx2intrin.h:838
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu16(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns each average in the corresponding element of the 256-bit result.
Definition avx2intrin.h:517
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_xor_si256(__m256i __a, __m256i __b)
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi8(__m256i __a, __m256i __b)
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned ...
Definition avx2intrin.h:403
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M)
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the c...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi64(__m256i __a, __m256i __b)
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_madd_epi16(__m256i __a, __m256i __b)
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit interme...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 128-bit vector of [4 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 256-bit vector of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturat...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packs_epi32(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation,...
Definition avx2intrin.h:200
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater...
Definition avx2intrin.h:802
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b)
Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers fr...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation,...
Definition avx2intrin.h:351
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and ...
Definition avx2intrin.h:644
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packs_epi16(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation,...
Definition avx2intrin.h:169
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu8(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns each average in the corresponding byte of the 256-bit result.
Definition avx2intrin.h:492
int __a
Definition emmintrin.h:4077
unsigned int __Y
Definition bmi2intrin.h:19