/* avx2intrin.h -- source listing from clang 22.0.0git. */
/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __IMMINTRIN_H
11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX2INTRIN_H
15#define __AVX2INTRIN_H
16
17/* Define the default attributes for the functions in this file. */
18#define __DEFAULT_FN_ATTRS256 \
19 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
20 __min_vector_width__(256)))
21#define __DEFAULT_FN_ATTRS128 \
22 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
23 __min_vector_width__(128)))
24
25#if defined(__cplusplus) && (__cplusplus >= 201103L)
26#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
27#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
28#else
29#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
30#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
31#endif
32
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes sixteen sum of absolute differences (SAD) operations on sets of
///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
///    \a Y.
///
///    Eight SAD results are computed using the lower half of the input
///    vectors, and another eight using the upper half. These 16-bit values
///    are returned in the lower and upper halves of the 256-bit result,
///    respectively.
///
///    A single SAD operation selects four bytes from \a X and four bytes from
///    \a Y as input. It computes the differences between each \a X byte and
///    the corresponding \a Y byte, takes the absolute value of each
///    difference, and sums these four values to form one 16-bit result. The
///    intrinsic computes 16 of these results with different sets of input
///    bytes.
///
///    For each set of eight results, the SAD operations use the same four
///    bytes from \a Y; the starting bit position for these four bytes is
///    specified by a two-bit field of \a M times 32 (\a M[1:0] for the lower
///    half, \a M[4:3] for the upper half). The eight operations use
///    successive sets of four bytes from \a X; the starting bit position for
///    the first set of four bytes is specified by one bit of \a M times 32
///    (\a M[2] for the lower half, \a M[5] for the upper half). These bit
///    positions are all relative to the 128-bit lane for each set of eight
///    operations.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///    An unsigned immediate value specifying the starting positions of the
///    bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), (int)(M)))

95/// Computes the absolute value of each signed byte in the 256-bit integer
96/// vector \a __a and returns each value in the corresponding byte of
97/// the result.
98///
99/// \headerfile <immintrin.h>
100///
101/// This intrinsic corresponds to the \c VPABSB instruction.
102///
103/// \param __a
104/// A 256-bit integer vector.
105/// \returns A 256-bit integer vector containing the result.
106static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
108 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
109}
110
111/// Computes the absolute value of each signed 16-bit element in the 256-bit
112/// vector of [16 x i16] in \a __a and returns each value in the
113/// corresponding element of the result.
114///
115/// \headerfile <immintrin.h>
116///
117/// This intrinsic corresponds to the \c VPABSW instruction.
118///
119/// \param __a
120/// A 256-bit vector of [16 x i16].
121/// \returns A 256-bit vector of [16 x i16] containing the result.
122static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
124 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
125}
126
127/// Computes the absolute value of each signed 32-bit element in the 256-bit
128/// vector of [8 x i32] in \a __a and returns each value in the
129/// corresponding element of the result.
130///
131/// \headerfile <immintrin.h>
132///
133/// This intrinsic corresponds to the \c VPABSD instruction.
134///
135/// \param __a
136/// A 256-bit vector of [8 x i32].
137/// \returns A 256-bit vector of [8 x i32] containing the result.
138static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
140 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
141}
142
143/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
144/// integers using signed saturation, and returns the 256-bit result.
145///
146/// \code{.operation}
147/// FOR i := 0 TO 7
148/// j := i*16
149/// k := i*8
150/// result[7+k:k] := SATURATE8(__a[15+j:j])
151/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
152/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
153/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
154/// ENDFOR
155/// \endcode
156///
157/// \headerfile <immintrin.h>
158///
159/// This intrinsic corresponds to the \c VPACKSSWB instruction.
160///
161/// \param __a
162/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
163/// result[191:128].
164/// \param __b
165/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
166/// result[255:192].
167/// \returns A 256-bit integer vector containing the result.
168static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
169_mm256_packs_epi16(__m256i __a, __m256i __b) {
170 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
171}
172
173/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
174/// integers using signed saturation, and returns the resulting 256-bit
175/// vector of [16 x i16].
176///
177/// \code{.operation}
178/// FOR i := 0 TO 3
179/// j := i*32
180/// k := i*16
181/// result[15+k:k] := SATURATE16(__a[31+j:j])
182/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
183/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
184/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
185/// ENDFOR
186/// \endcode
187///
188/// \headerfile <immintrin.h>
189///
190/// This intrinsic corresponds to the \c VPACKSSDW instruction.
191///
192/// \param __a
193/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
194/// result[191:128].
195/// \param __b
196/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
197/// result[255:192].
198/// \returns A 256-bit vector of [16 x i16] containing the result.
199static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
200_mm256_packs_epi32(__m256i __a, __m256i __b) {
201 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
202}
203
204/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
205/// using unsigned saturation, and returns the 256-bit result.
206///
207/// \code{.operation}
208/// FOR i := 0 TO 7
209/// j := i*16
210/// k := i*8
211/// result[7+k:k] := SATURATE8U(__a[15+j:j])
212/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
213/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
214/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
215/// ENDFOR
216/// \endcode
217///
218/// \headerfile <immintrin.h>
219///
220/// This intrinsic corresponds to the \c VPACKUSWB instruction.
221///
222/// \param __a
223/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
224/// result[191:128].
225/// \param __b
226/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
227/// result[255:192].
228/// \returns A 256-bit integer vector containing the result.
229static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
230_mm256_packus_epi16(__m256i __a, __m256i __b) {
231 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
232}
233
234/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
235/// using unsigned saturation, and returns the resulting 256-bit vector of
236/// [16 x i16].
237///
238/// \code{.operation}
239/// FOR i := 0 TO 3
240/// j := i*32
241/// k := i*16
242/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
243/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
244/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
245/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
246/// ENDFOR
247/// \endcode
248///
249/// \headerfile <immintrin.h>
250///
251/// This intrinsic corresponds to the \c VPACKUSDW instruction.
252///
253/// \param __V1
254/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
255/// result[191:128].
256/// \param __V2
257/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
258/// result[255:192].
259/// \returns A 256-bit vector of [16 x i16] containing the result.
260static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
261_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
262 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
263}
264
265/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
266/// vectors and returns the lower 8 bits of each sum in the corresponding
267/// byte of the 256-bit integer vector result (overflow is ignored).
268///
269/// \headerfile <immintrin.h>
270///
271/// This intrinsic corresponds to the \c VPADDB instruction.
272///
273/// \param __a
274/// A 256-bit integer vector containing one of the source operands.
275/// \param __b
276/// A 256-bit integer vector containing one of the source operands.
277/// \returns A 256-bit integer vector containing the sums.
278static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
279_mm256_add_epi8(__m256i __a, __m256i __b) {
280 return (__m256i)((__v32qu)__a + (__v32qu)__b);
281}
282
283/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284/// [16 x i16] and returns the lower 16 bits of each sum in the
285/// corresponding element of the [16 x i16] result (overflow is ignored).
286///
287/// \headerfile <immintrin.h>
288///
289/// This intrinsic corresponds to the \c VPADDW instruction.
290///
291/// \param __a
292/// A 256-bit vector of [16 x i16] containing one of the source operands.
293/// \param __b
294/// A 256-bit vector of [16 x i16] containing one of the source operands.
295/// \returns A 256-bit vector of [16 x i16] containing the sums.
296static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
297_mm256_add_epi16(__m256i __a, __m256i __b) {
298 return (__m256i)((__v16hu)__a + (__v16hu)__b);
299}
300
301/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
302/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
303/// element of the [8 x i32] result (overflow is ignored).
304///
305/// \headerfile <immintrin.h>
306///
307/// This intrinsic corresponds to the \c VPADDD instruction.
308///
309/// \param __a
310/// A 256-bit vector of [8 x i32] containing one of the source operands.
311/// \param __b
312/// A 256-bit vector of [8 x i32] containing one of the source operands.
313/// \returns A 256-bit vector of [8 x i32] containing the sums.
314static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
315_mm256_add_epi32(__m256i __a, __m256i __b) {
316 return (__m256i)((__v8su)__a + (__v8su)__b);
317}
318
319/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
320/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
321/// element of the [4 x i64] result (overflow is ignored).
322///
323/// \headerfile <immintrin.h>
324///
325/// This intrinsic corresponds to the \c VPADDQ instruction.
326///
327/// \param __a
328/// A 256-bit vector of [4 x i64] containing one of the source operands.
329/// \param __b
330/// A 256-bit vector of [4 x i64] containing one of the source operands.
331/// \returns A 256-bit vector of [4 x i64] containing the sums.
332static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
333_mm256_add_epi64(__m256i __a, __m256i __b) {
334 return (__m256i)((__v4du)__a + (__v4du)__b);
335}
336
337/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
338/// vectors using signed saturation, and returns each sum in the
339/// corresponding byte of the 256-bit integer vector result.
340///
341/// \headerfile <immintrin.h>
342///
343/// This intrinsic corresponds to the \c VPADDSB instruction.
344///
345/// \param __a
346/// A 256-bit integer vector containing one of the source operands.
347/// \param __b
348/// A 256-bit integer vector containing one of the source operands.
349/// \returns A 256-bit integer vector containing the sums.
350static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
351_mm256_adds_epi8(__m256i __a, __m256i __b) {
352 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
353}
354
355/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
356/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
357///
358/// \headerfile <immintrin.h>
359///
360/// This intrinsic corresponds to the \c VPADDSW instruction.
361///
362/// \param __a
363/// A 256-bit vector of [16 x i16] containing one of the source operands.
364/// \param __b
365/// A 256-bit vector of [16 x i16] containing one of the source operands.
366/// \returns A 256-bit vector of [16 x i16] containing the sums.
367static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
368_mm256_adds_epi16(__m256i __a, __m256i __b) {
369 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
370}
371
372/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
373/// vectors using unsigned saturation, and returns each sum in the
374/// corresponding byte of the 256-bit integer vector result.
375///
376/// \headerfile <immintrin.h>
377///
378/// This intrinsic corresponds to the \c VPADDUSB instruction.
379///
380/// \param __a
381/// A 256-bit integer vector containing one of the source operands.
382/// \param __b
383/// A 256-bit integer vector containing one of the source operands.
384/// \returns A 256-bit integer vector containing the sums.
385static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
386_mm256_adds_epu8(__m256i __a, __m256i __b) {
387 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
388}
389
390/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
391/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
392///
393/// \headerfile <immintrin.h>
394///
395/// This intrinsic corresponds to the \c VPADDUSW instruction.
396///
397/// \param __a
398/// A 256-bit vector of [16 x i16] containing one of the source operands.
399/// \param __b
400/// A 256-bit vector of [16 x i16] containing one of the source operands.
401/// \returns A 256-bit vector of [16 x i16] containing the sums.
402static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
403_mm256_adds_epu16(__m256i __a, __m256i __b) {
404 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
405}
406
/// Uses the lower half of the 256-bit vector \a a as the upper half of a
///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
///    as the lower half of the temporary value. Right-shifts the temporary
///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
///    \a b to make another temporary value, right shifts by \a n, and uses
///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
///    result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (n)))

435/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
436/// \a __b.
437///
438/// \headerfile <immintrin.h>
439///
440/// This intrinsic corresponds to the \c VPAND instruction.
441///
442/// \param __a
443/// A 256-bit integer vector.
444/// \param __b
445/// A 256-bit integer vector.
446/// \returns A 256-bit integer vector containing the result.
447static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
448_mm256_and_si256(__m256i __a, __m256i __b)
449{
450 return (__m256i)((__v4du)__a & (__v4du)__b);
451}
452
453/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
454/// the bitwise NOT of the 256-bit integer vector in \a __a.
455///
456/// \headerfile <immintrin.h>
457///
458/// This intrinsic corresponds to the \c VPANDN instruction.
459///
460/// \param __a
461/// A 256-bit integer vector.
462/// \param __b
463/// A 256-bit integer vector.
464/// \returns A 256-bit integer vector containing the result.
465static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
466_mm256_andnot_si256(__m256i __a, __m256i __b)
467{
468 return (__m256i)(~(__v4du)__a & (__v4du)__b);
469}
470
471/// Computes the averages of the corresponding unsigned bytes in the two
472/// 256-bit integer vectors in \a __a and \a __b and returns each
473/// average in the corresponding byte of the 256-bit result.
474///
475/// \code{.operation}
476/// FOR i := 0 TO 31
477/// j := i*8
478/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
479/// ENDFOR
480/// \endcode
481///
482/// \headerfile <immintrin.h>
483///
484/// This intrinsic corresponds to the \c VPAVGB instruction.
485///
486/// \param __a
487/// A 256-bit integer vector.
488/// \param __b
489/// A 256-bit integer vector.
490/// \returns A 256-bit integer vector containing the result.
491static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
492_mm256_avg_epu8(__m256i __a, __m256i __b) {
493 return (__m256i)__builtin_ia32_pavgb256((__v32qu)__a, (__v32qu)__b);
494}
495
496/// Computes the averages of the corresponding unsigned 16-bit integers in
497/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
498/// each average in the corresponding element of the 256-bit result.
499///
500/// \code{.operation}
501/// FOR i := 0 TO 15
502/// j := i*16
503/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
504/// ENDFOR
505/// \endcode
506///
507/// \headerfile <immintrin.h>
508///
509/// This intrinsic corresponds to the \c VPAVGW instruction.
510///
511/// \param __a
512/// A 256-bit vector of [16 x i16].
513/// \param __b
514/// A 256-bit vector of [16 x i16].
515/// \returns A 256-bit vector of [16 x i16] containing the result.
516static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
517_mm256_avg_epu16(__m256i __a, __m256i __b) {
518 return (__m256i)__builtin_ia32_pavgw256((__v16hu)__a, (__v16hu)__b);
519}
520
521/// Merges 8-bit integer values from either of the two 256-bit vectors
522/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
523/// the resulting 256-bit integer vector.
524///
525/// \code{.operation}
526/// FOR i := 0 TO 31
527/// j := i*8
528/// IF __M[7+i] == 0
529/// result[7+j:j] := __V1[7+j:j]
530/// ELSE
531/// result[7+j:j] := __V2[7+j:j]
532/// FI
533/// ENDFOR
534/// \endcode
535///
536/// \headerfile <immintrin.h>
537///
538/// This intrinsic corresponds to the \c VPBLENDVB instruction.
539///
540/// \param __V1
541/// A 256-bit integer vector containing source values.
542/// \param __V2
543/// A 256-bit integer vector containing source values.
544/// \param __M
545/// A 256-bit integer vector, with bit [7] of each byte specifying the
546/// source for each corresponding byte of the result. When the mask bit
547/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
548/// \a __V2.
549/// \returns A 256-bit integer vector containing the result.
550static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
551_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
552 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
553 (__v32qi)__M);
554}
555
/// Merges 16-bit integer values from either of the two 256-bit vectors
///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
///    and returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                      (__v16hi)(__m256i)(V2), (int)(M)))

597/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
598/// \a __b for equality and returns the outcomes in the corresponding
599/// bytes of the 256-bit result.
600///
601/// \code{.operation}
602/// FOR i := 0 TO 31
603/// j := i*8
604/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
605/// ENDFOR
606/// \endcode
607///
608/// \headerfile <immintrin.h>
609///
610/// This intrinsic corresponds to the \c VPCMPEQB instruction.
611///
612/// \param __a
613/// A 256-bit integer vector containing one of the inputs.
614/// \param __b
615/// A 256-bit integer vector containing one of the inputs.
616/// \returns A 256-bit integer vector containing the result.
617static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
618_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
619{
620 return (__m256i)((__v32qi)__a == (__v32qi)__b);
621}
622
623/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
624/// \a __a and \a __b for equality and returns the outcomes in the
625/// corresponding elements of the 256-bit result.
626///
627/// \code{.operation}
628/// FOR i := 0 TO 15
629/// j := i*16
630/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
631/// ENDFOR
632/// \endcode
633///
634/// \headerfile <immintrin.h>
635///
636/// This intrinsic corresponds to the \c VPCMPEQW instruction.
637///
638/// \param __a
639/// A 256-bit vector of [16 x i16] containing one of the inputs.
640/// \param __b
641/// A 256-bit vector of [16 x i16] containing one of the inputs.
642/// \returns A 256-bit vector of [16 x i16] containing the result.
643static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
644_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
645{
646 return (__m256i)((__v16hi)__a == (__v16hi)__b);
647}
648
649/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
650/// \a __a and \a __b for equality and returns the outcomes in the
651/// corresponding elements of the 256-bit result.
652///
653/// \code{.operation}
654/// FOR i := 0 TO 7
655/// j := i*32
656/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
657/// ENDFOR
658/// \endcode
659///
660/// \headerfile <immintrin.h>
661///
662/// This intrinsic corresponds to the \c VPCMPEQD instruction.
663///
664/// \param __a
665/// A 256-bit vector of [8 x i32] containing one of the inputs.
666/// \param __b
667/// A 256-bit vector of [8 x i32] containing one of the inputs.
668/// \returns A 256-bit vector of [8 x i32] containing the result.
669static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
670_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
671{
672 return (__m256i)((__v8si)__a == (__v8si)__b);
673}
674
675/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
676/// \a __a and \a __b for equality and returns the outcomes in the
677/// corresponding elements of the 256-bit result.
678///
679/// \code{.operation}
680/// FOR i := 0 TO 3
681/// j := i*64
682/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
683/// ENDFOR
684/// \endcode
685///
686/// \headerfile <immintrin.h>
687///
688/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
689///
690/// \param __a
691/// A 256-bit vector of [4 x i64] containing one of the inputs.
692/// \param __b
693/// A 256-bit vector of [4 x i64] containing one of the inputs.
694/// \returns A 256-bit vector of [4 x i64] containing the result.
695static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
696_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
697{
698 return (__m256i)((__v4di)__a == (__v4di)__b);
699}
700
701/// Compares corresponding signed bytes in the 256-bit integer vectors in
702/// \a __a and \a __b for greater-than and returns the outcomes in the
703/// corresponding bytes of the 256-bit result.
704///
705/// \code{.operation}
706/// FOR i := 0 TO 31
707/// j := i*8
708/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
709/// ENDFOR
710/// \endcode
711///
712/// \headerfile <immintrin.h>
713///
714/// This intrinsic corresponds to the \c VPCMPGTB instruction.
715///
716/// \param __a
717/// A 256-bit integer vector containing one of the inputs.
718/// \param __b
719/// A 256-bit integer vector containing one of the inputs.
720/// \returns A 256-bit integer vector containing the result.
721static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
722_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
723{
724 /* This function always performs a signed comparison, but __v32qi is a char
725 which may be signed or unsigned, so use __v32qs. */
726 return (__m256i)((__v32qs)__a > (__v32qs)__b);
727}
728
729/// Compares corresponding signed elements in the 256-bit vectors of
730/// [16 x i16] in \a __a and \a __b for greater-than and returns the
731/// outcomes in the corresponding elements of the 256-bit result.
732///
733/// \code{.operation}
734/// FOR i := 0 TO 15
735/// j := i*16
736/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
737/// ENDFOR
738/// \endcode
739///
740/// \headerfile <immintrin.h>
741///
742/// This intrinsic corresponds to the \c VPCMPGTW instruction.
743///
744/// \param __a
745/// A 256-bit vector of [16 x i16] containing one of the inputs.
746/// \param __b
747/// A 256-bit vector of [16 x i16] containing one of the inputs.
748/// \returns A 256-bit vector of [16 x i16] containing the result.
749static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
750_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
751{
752 return (__m256i)((__v16hi)__a > (__v16hi)__b);
753}
754
755/// Compares corresponding signed elements in the 256-bit vectors of
756/// [8 x i32] in \a __a and \a __b for greater-than and returns the
757/// outcomes in the corresponding elements of the 256-bit result.
758///
759/// \code{.operation}
760/// FOR i := 0 TO 7
761/// j := i*32
762/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
763/// ENDFOR
764/// \endcode
765///
766/// \headerfile <immintrin.h>
767///
768/// This intrinsic corresponds to the \c VPCMPGTD instruction.
769///
770/// \param __a
771/// A 256-bit vector of [8 x i32] containing one of the inputs.
772/// \param __b
773/// A 256-bit vector of [8 x i32] containing one of the inputs.
774/// \returns A 256-bit vector of [8 x i32] containing the result.
775static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
776_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
777{
778 return (__m256i)((__v8si)__a > (__v8si)__b);
779}
780
781/// Compares corresponding signed elements in the 256-bit vectors of
782/// [4 x i64] in \a __a and \a __b for greater-than and returns the
783/// outcomes in the corresponding elements of the 256-bit result.
784///
785/// \code{.operation}
786/// FOR i := 0 TO 3
787/// j := i*64
788/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
789/// ENDFOR
790/// \endcode
791///
792/// \headerfile <immintrin.h>
793///
794/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
795///
796/// \param __a
797/// A 256-bit vector of [4 x i64] containing one of the inputs.
798/// \param __b
799/// A 256-bit vector of [4 x i64] containing one of the inputs.
800/// \returns A 256-bit vector of [4 x i64] containing the result.
801static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
802_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
803{
804 return (__m256i)((__v4di)__a > (__v4di)__b);
805}
806
807/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
808/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
809/// element of the [16 x i16] result (overflow is ignored). Sums from
810/// \a __a are returned in the lower 64 bits of each 128-bit half of the
811/// result; sums from \a __b are returned in the upper 64 bits of each
812/// 128-bit half of the result.
813///
814/// \code{.operation}
815/// FOR i := 0 TO 1
816/// j := i*128
817/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
818/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
819/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
820/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
821/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
822/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
823/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
824/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
825/// ENDFOR
826/// \endcode
827///
828/// \headerfile <immintrin.h>
829///
830/// This intrinsic corresponds to the \c VPHADDW instruction.
831///
832/// \param __a
833/// A 256-bit vector of [16 x i16] containing one of the source operands.
834/// \param __b
835/// A 256-bit vector of [16 x i16] containing one of the source operands.
836/// \returns A 256-bit vector of [16 x i16] containing the sums.
837static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
838_mm256_hadd_epi16(__m256i __a, __m256i __b) {
839 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
840}
841
842/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
843/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
844/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
845/// are returned in the lower 64 bits of each 128-bit half of the result;
846/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
847/// of the result.
848///
849/// \code{.operation}
850/// FOR i := 0 TO 1
851/// j := i*128
852/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
853/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
854/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
855/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
856/// ENDFOR
857/// \endcode
858///
859/// \headerfile <immintrin.h>
860///
861/// This intrinsic corresponds to the \c VPHADDD instruction.
862///
863/// \param __a
864/// A 256-bit vector of [8 x i32] containing one of the source operands.
865/// \param __b
866/// A 256-bit vector of [8 x i32] containing one of the source operands.
867/// \returns A 256-bit vector of [8 x i32] containing the sums.
868static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
869_mm256_hadd_epi32(__m256i __a, __m256i __b) {
870 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
871}
872
873/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
874/// vectors of [16 x i16] using signed saturation and returns each sum in
875/// an element of the [16 x i16] result. Sums from \a __a are returned in
876/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
877/// are returned in the upper 64 bits of each 128-bit half of the result.
878///
879/// \code{.operation}
880/// FOR i := 0 TO 1
881/// j := i*128
882/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
883/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
884/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
885/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
886/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
887/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
888/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
889/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
890/// ENDFOR
891/// \endcode
892///
893/// \headerfile <immintrin.h>
894///
895/// This intrinsic corresponds to the \c VPHADDSW instruction.
896///
897/// \param __a
898/// A 256-bit vector of [16 x i16] containing one of the source operands.
899/// \param __b
900/// A 256-bit vector of [16 x i16] containing one of the source operands.
901/// \returns A 256-bit vector of [16 x i16] containing the sums.
902static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
903_mm256_hadds_epi16(__m256i __a, __m256i __b) {
904 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
905}
906
907/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
908/// vectors of [16 x i16] and returns the lower 16 bits of each difference
909/// in an element of the [16 x i16] result (overflow is ignored).
910/// Differences from \a __a are returned in the lower 64 bits of each
911/// 128-bit half of the result; differences from \a __b are returned in the
912/// upper 64 bits of each 128-bit half of the result.
913///
914/// \code{.operation}
915/// FOR i := 0 TO 1
916/// j := i*128
917/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
918/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
919/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
920/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
921/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
922/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
923/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
924/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
925/// ENDFOR
926/// \endcode
927///
928/// \headerfile <immintrin.h>
929///
930/// This intrinsic corresponds to the \c VPHSUBW instruction.
931///
932/// \param __a
933/// A 256-bit vector of [16 x i16] containing one of the source operands.
934/// \param __b
935/// A 256-bit vector of [16 x i16] containing one of the source operands.
936/// \returns A 256-bit vector of [16 x i16] containing the differences.
937static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
938_mm256_hsub_epi16(__m256i __a, __m256i __b) {
939 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
940}
941
942/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
943/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
944/// an element of the [8 x i32] result (overflow is ignored). Differences
945/// from \a __a are returned in the lower 64 bits of each 128-bit half of
946/// the result; differences from \a __b are returned in the upper 64 bits
947/// of each 128-bit half of the result.
948///
949/// \code{.operation}
950/// FOR i := 0 TO 1
951/// j := i*128
952/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
953/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
954/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
955/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
956/// ENDFOR
957/// \endcode
958///
959/// \headerfile <immintrin.h>
960///
961/// This intrinsic corresponds to the \c VPHSUBD instruction.
962///
963/// \param __a
964/// A 256-bit vector of [8 x i32] containing one of the source operands.
965/// \param __b
966/// A 256-bit vector of [8 x i32] containing one of the source operands.
967/// \returns A 256-bit vector of [8 x i32] containing the differences.
968static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
969_mm256_hsub_epi32(__m256i __a, __m256i __b) {
970 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
971}
972
973/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
974/// vectors of [16 x i16] using signed saturation and returns each sum in
975/// an element of the [16 x i16] result. Differences from \a __a are
976/// returned in the lower 64 bits of each 128-bit half of the result;
977/// differences from \a __b are returned in the upper 64 bits of each
978/// 128-bit half of the result.
979///
980/// \code{.operation}
981/// FOR i := 0 TO 1
982/// j := i*128
983/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
984/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
985/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
986/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
987/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
988/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
989/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
990/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
991/// ENDFOR
992/// \endcode
993///
994/// \headerfile <immintrin.h>
995///
996/// This intrinsic corresponds to the \c VPHSUBSW instruction.
997///
998/// \param __a
999/// A 256-bit vector of [16 x i16] containing one of the source operands.
1000/// \param __b
1001/// A 256-bit vector of [16 x i16] containing one of the source operands.
1002/// \returns A 256-bit vector of [16 x i16] containing the differences.
1003static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1004_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
1005 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1006}
1007
1008/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1009/// with the corresponding signed byte from the 256-bit integer vector in
1010/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1011/// pairs of those products using signed saturation to form 16-bit sums
1012/// returned as elements of the [16 x i16] result.
1013///
1014/// \code{.operation}
1015/// FOR i := 0 TO 15
1016/// j := i*16
1017/// temp1 := __a[j+7:j] * __b[j+7:j]
1018/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1019/// result[j+15:j] := SATURATE16(temp1 + temp2)
1020/// ENDFOR
1021/// \endcode
1022///
1023/// \headerfile <immintrin.h>
1024///
1025/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1026///
1027/// \param __a
1028/// A 256-bit vector containing one of the source operands.
1029/// \param __b
1030/// A 256-bit vector containing one of the source operands.
1031/// \returns A 256-bit vector of [16 x i16] containing the result.
1032static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1033_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
1034 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1035}
1036
1037/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1038/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1039/// those products to form 32-bit sums returned as elements of the
1040/// [8 x i32] result.
1041///
1042/// There is only one wraparound case: when all four of the 16-bit sources
1043/// are \c 0x8000, the result will be \c 0x80000000.
1044///
1045/// \code{.operation}
1046/// FOR i := 0 TO 7
1047/// j := i*32
1048/// temp1 := __a[j+15:j] * __b[j+15:j]
1049/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1050/// result[j+31:j] := temp1 + temp2
1051/// ENDFOR
1052/// \endcode
1053///
1054/// \headerfile <immintrin.h>
1055///
1056/// This intrinsic corresponds to the \c VPMADDWD instruction.
1057///
1058/// \param __a
1059/// A 256-bit vector of [16 x i16] containing one of the source operands.
1060/// \param __b
1061/// A 256-bit vector of [16 x i16] containing one of the source operands.
1062/// \returns A 256-bit vector of [8 x i32] containing the result.
1063static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1064_mm256_madd_epi16(__m256i __a, __m256i __b) {
1065 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1066}
1067
1068/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1069/// in \a __a and \a __b and returns the larger of each pair in the
1070/// corresponding byte of the 256-bit result.
1071///
1072/// \headerfile <immintrin.h>
1073///
1074/// This intrinsic corresponds to the \c VPMAXSB instruction.
1075///
1076/// \param __a
1077/// A 256-bit integer vector.
1078/// \param __b
1079/// A 256-bit integer vector.
1080/// \returns A 256-bit integer vector containing the result.
1081static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1082_mm256_max_epi8(__m256i __a, __m256i __b) {
1083 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1084}
1085
1086/// Compares the corresponding signed 16-bit integers in the two 256-bit
1087/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1088/// each pair in the corresponding element of the 256-bit result.
1089///
1090/// \headerfile <immintrin.h>
1091///
1092/// This intrinsic corresponds to the \c VPMAXSW instruction.
1093///
1094/// \param __a
1095/// A 256-bit vector of [16 x i16].
1096/// \param __b
1097/// A 256-bit vector of [16 x i16].
1098/// \returns A 256-bit vector of [16 x i16] containing the result.
1099static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1100_mm256_max_epi16(__m256i __a, __m256i __b) {
1101 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1102}
1103
1104/// Compares the corresponding signed 32-bit integers in the two 256-bit
1105/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1106/// each pair in the corresponding element of the 256-bit result.
1107///
1108/// \headerfile <immintrin.h>
1109///
1110/// This intrinsic corresponds to the \c VPMAXSD instruction.
1111///
1112/// \param __a
1113/// A 256-bit vector of [8 x i32].
1114/// \param __b
1115/// A 256-bit vector of [8 x i32].
1116/// \returns A 256-bit vector of [8 x i32] containing the result.
1117static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1118_mm256_max_epi32(__m256i __a, __m256i __b) {
1119 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1120}
1121
1122/// Compares the corresponding unsigned bytes in the two 256-bit integer
1123/// vectors in \a __a and \a __b and returns the larger of each pair in
1124/// the corresponding byte of the 256-bit result.
1125///
1126/// \headerfile <immintrin.h>
1127///
1128/// This intrinsic corresponds to the \c VPMAXUB instruction.
1129///
1130/// \param __a
1131/// A 256-bit integer vector.
1132/// \param __b
1133/// A 256-bit integer vector.
1134/// \returns A 256-bit integer vector containing the result.
1135static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1136_mm256_max_epu8(__m256i __a, __m256i __b) {
1137 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1138}
1139
1140/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1141/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1142/// each pair in the corresponding element of the 256-bit result.
1143///
1144/// \headerfile <immintrin.h>
1145///
1146/// This intrinsic corresponds to the \c VPMAXUW instruction.
1147///
1148/// \param __a
1149/// A 256-bit vector of [16 x i16].
1150/// \param __b
1151/// A 256-bit vector of [16 x i16].
1152/// \returns A 256-bit vector of [16 x i16] containing the result.
1153static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1154_mm256_max_epu16(__m256i __a, __m256i __b) {
1155 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1156}
1157
1158/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1159/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1160/// each pair in the corresponding element of the 256-bit result.
1161///
1162/// \headerfile <immintrin.h>
1163///
1164/// This intrinsic corresponds to the \c VPMAXUD instruction.
1165///
1166/// \param __a
1167/// A 256-bit vector of [8 x i32].
1168/// \param __b
1169/// A 256-bit vector of [8 x i32].
1170/// \returns A 256-bit vector of [8 x i32] containing the result.
1171static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1172_mm256_max_epu32(__m256i __a, __m256i __b) {
1173 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1174}
1175
1176/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1177/// in \a __a and \a __b and returns the smaller of each pair in the
1178/// corresponding byte of the 256-bit result.
1179///
1180/// \headerfile <immintrin.h>
1181///
1182/// This intrinsic corresponds to the \c VPMINSB instruction.
1183///
1184/// \param __a
1185/// A 256-bit integer vector.
1186/// \param __b
1187/// A 256-bit integer vector.
1188/// \returns A 256-bit integer vector containing the result.
1189static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1190_mm256_min_epi8(__m256i __a, __m256i __b) {
1191 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1192}
1193
1194/// Compares the corresponding signed 16-bit integers in the two 256-bit
1195/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1196/// each pair in the corresponding element of the 256-bit result.
1197///
1198/// \headerfile <immintrin.h>
1199///
1200/// This intrinsic corresponds to the \c VPMINSW instruction.
1201///
1202/// \param __a
1203/// A 256-bit vector of [16 x i16].
1204/// \param __b
1205/// A 256-bit vector of [16 x i16].
1206/// \returns A 256-bit vector of [16 x i16] containing the result.
1207static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1208_mm256_min_epi16(__m256i __a, __m256i __b) {
1209 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1210}
1211
1212/// Compares the corresponding signed 32-bit integers in the two 256-bit
1213/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1214/// each pair in the corresponding element of the 256-bit result.
1215///
1216/// \headerfile <immintrin.h>
1217///
1218/// This intrinsic corresponds to the \c VPMINSD instruction.
1219///
1220/// \param __a
1221/// A 256-bit vector of [8 x i32].
1222/// \param __b
1223/// A 256-bit vector of [8 x i32].
1224/// \returns A 256-bit vector of [8 x i32] containing the result.
1225static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1226_mm256_min_epi32(__m256i __a, __m256i __b) {
1227 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1228}
1229
1230/// Compares the corresponding unsigned bytes in the two 256-bit integer
1231/// vectors in \a __a and \a __b and returns the smaller of each pair in
1232/// the corresponding byte of the 256-bit result.
1233///
1234/// \headerfile <immintrin.h>
1235///
1236/// This intrinsic corresponds to the \c VPMINUB instruction.
1237///
1238/// \param __a
1239/// A 256-bit integer vector.
1240/// \param __b
1241/// A 256-bit integer vector.
1242/// \returns A 256-bit integer vector containing the result.
1243static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1244_mm256_min_epu8(__m256i __a, __m256i __b) {
1245 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1246}
1247
1248/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1249/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1250/// each pair in the corresponding element of the 256-bit result.
1251///
1252/// \headerfile <immintrin.h>
1253///
1254/// This intrinsic corresponds to the \c VPMINUW instruction.
1255///
1256/// \param __a
1257/// A 256-bit vector of [16 x i16].
1258/// \param __b
1259/// A 256-bit vector of [16 x i16].
1260/// \returns A 256-bit vector of [16 x i16] containing the result.
1261static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1262_mm256_min_epu16(__m256i __a, __m256i __b) {
1263 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1264}
1265
1266/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1267/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1268/// each pair in the corresponding element of the 256-bit result.
1269///
1270/// \headerfile <immintrin.h>
1271///
1272/// This intrinsic corresponds to the \c VPMINUD instruction.
1273///
1274/// \param __a
1275/// A 256-bit vector of [8 x i32].
1276/// \param __b
1277/// A 256-bit vector of [8 x i32].
1278/// \returns A 256-bit vector of [8 x i32] containing the result.
1279static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1280_mm256_min_epu32(__m256i __a, __m256i __b) {
1281 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1282}
1283
1284/// Creates a 32-bit integer mask from the most significant bit of each byte
1285/// in the 256-bit integer vector in \a __a and returns the result.
1286///
1287/// \code{.operation}
1288/// FOR i := 0 TO 31
1289/// j := i*8
1290/// result[i] := __a[j+7]
1291/// ENDFOR
1292/// \endcode
1293///
1294/// \headerfile <immintrin.h>
1295///
1296/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1297///
1298/// \param __a
1299/// A 256-bit integer vector containing the source bytes.
1300/// \returns The 32-bit integer mask.
1301static __inline__ int __DEFAULT_FN_ATTRS256_CONSTEXPR
1303 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1304}
1305
1306/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1307/// the 16-bit values in the corresponding elements of a 256-bit vector
1308/// of [16 x i16].
1309///
1310/// \code{.operation}
1311/// FOR i := 0 TO 15
1312/// j := i*8
1313/// k := i*16
1314/// result[k+15:k] := SignExtend(__V[j+7:j])
1315/// ENDFOR
1316/// \endcode
1317///
1318/// \headerfile <immintrin.h>
1319///
1320/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1321///
1322/// \param __V
1323/// A 128-bit integer vector containing the source bytes.
1324/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1325/// values.
1326static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1328 /* This function always performs a signed extension, but __v16qi is a char
1329 which may be signed or unsigned, so use __v16qs. */
1330 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1331}
1332
1333/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1334/// \a __V and returns the 32-bit values in the corresponding elements of a
1335/// 256-bit vector of [8 x i32].
1336///
1337/// \code{.operation}
1338/// FOR i := 0 TO 7
1339/// j := i*8
1340/// k := i*32
1341/// result[k+31:k] := SignExtend(__V[j+7:j])
1342/// ENDFOR
1343/// \endcode
1344///
1345/// \headerfile <immintrin.h>
1346///
1347/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1348///
1349/// \param __V
1350/// A 128-bit integer vector containing the source bytes.
1351/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1352/// values.
1353static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1355 /* This function always performs a signed extension, but __v16qi is a char
1356 which may be signed or unsigned, so use __v16qs. */
1357 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1358}
1359
1360/// Sign-extends the first four bytes from the 128-bit integer vector in
1361/// \a __V and returns the 64-bit values in the corresponding elements of a
1362/// 256-bit vector of [4 x i64].
1363///
1364/// \code{.operation}
1365/// result[63:0] := SignExtend(__V[7:0])
1366/// result[127:64] := SignExtend(__V[15:8])
1367/// result[191:128] := SignExtend(__V[23:16])
1368/// result[255:192] := SignExtend(__V[31:24])
1369/// \endcode
1370///
1371/// \headerfile <immintrin.h>
1372///
1373/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1374///
1375/// \param __V
1376/// A 128-bit integer vector containing the source bytes.
1377/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1378/// values.
1379static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1381 /* This function always performs a signed extension, but __v16qi is a char
1382 which may be signed or unsigned, so use __v16qs. */
1383 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1384}
1385
1386/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1387/// \a __V and returns the 32-bit values in the corresponding elements of a
1388/// 256-bit vector of [8 x i32].
1389///
1390/// \code{.operation}
1391/// FOR i := 0 TO 7
1392/// j := i*16
1393/// k := i*32
1394/// result[k+31:k] := SignExtend(__V[j+15:j])
1395/// ENDFOR
1396/// \endcode
1397///
1398/// \headerfile <immintrin.h>
1399///
1400/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1401///
1402/// \param __V
1403/// A 128-bit vector of [8 x i16] containing the source values.
1404/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1405/// values.
1406static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1408 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1409}
1410
1411/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1412/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1413/// elements of a 256-bit vector of [4 x i64].
1414///
1415/// \code{.operation}
1416/// result[63:0] := SignExtend(__V[15:0])
1417/// result[127:64] := SignExtend(__V[31:16])
1418/// result[191:128] := SignExtend(__V[47:32])
1419/// result[255:192] := SignExtend(__V[64:48])
1420/// \endcode
1421///
1422/// \headerfile <immintrin.h>
1423///
1424/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1425///
1426/// \param __V
1427/// A 128-bit vector of [8 x i16] containing the source values.
1428/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1429/// values.
1430static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1432 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1433}
1434
1435/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1436/// \a __V and returns the 64-bit values in the corresponding elements of a
1437/// 256-bit vector of [4 x i64].
1438///
1439/// \code{.operation}
1440/// result[63:0] := SignExtend(__V[31:0])
1441/// result[127:64] := SignExtend(__V[63:32])
1442/// result[191:128] := SignExtend(__V[95:64])
1443/// result[255:192] := SignExtend(__V[127:96])
1444/// \endcode
1445///
1446/// \headerfile <immintrin.h>
1447///
1448/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1449///
1450/// \param __V
1451/// A 128-bit vector of [4 x i32] containing the source values.
1452/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1453/// values.
1454static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1456 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1457}
1458
1459/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1460/// the 16-bit values in the corresponding elements of a 256-bit vector
1461/// of [16 x i16].
1462///
1463/// \code{.operation}
1464/// FOR i := 0 TO 15
1465/// j := i*8
1466/// k := i*16
1467/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1468/// ENDFOR
1469/// \endcode
1470///
1471/// \headerfile <immintrin.h>
1472///
1473/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1474///
1475/// \param __V
1476/// A 128-bit integer vector containing the source bytes.
1477/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1478/// values.
1479static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1481 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1482}
1483
1484/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1485/// \a __V and returns the 32-bit values in the corresponding elements of a
1486/// 256-bit vector of [8 x i32].
1487///
1488/// \code{.operation}
1489/// FOR i := 0 TO 7
1490/// j := i*8
1491/// k := i*32
1492/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1493/// ENDFOR
1494/// \endcode
1495///
1496/// \headerfile <immintrin.h>
1497///
1498/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1499///
1500/// \param __V
1501/// A 128-bit integer vector containing the source bytes.
1502/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1503/// values.
1504static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1506 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1507}
1508
1509/// Zero-extends the first four bytes from the 128-bit integer vector in
1510/// \a __V and returns the 64-bit values in the corresponding elements of a
1511/// 256-bit vector of [4 x i64].
1512///
1513/// \code{.operation}
1514/// result[63:0] := ZeroExtend(__V[7:0])
1515/// result[127:64] := ZeroExtend(__V[15:8])
1516/// result[191:128] := ZeroExtend(__V[23:16])
1517/// result[255:192] := ZeroExtend(__V[31:24])
1518/// \endcode
1519///
1520/// \headerfile <immintrin.h>
1521///
1522/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1523///
1524/// \param __V
1525/// A 128-bit integer vector containing the source bytes.
1526/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1527/// values.
1528static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1530 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1531}
1532
1533/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1534/// \a __V and returns the 32-bit values in the corresponding elements of a
1535/// 256-bit vector of [8 x i32].
1536///
1537/// \code{.operation}
1538/// FOR i := 0 TO 7
1539/// j := i*16
1540/// k := i*32
1541/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1542/// ENDFOR
1543/// \endcode
1544///
1545/// \headerfile <immintrin.h>
1546///
1547/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1548///
1549/// \param __V
1550/// A 128-bit vector of [8 x i16] containing the source values.
1551/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1552/// values.
1553static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1555 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1556}
1557
1558/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1559/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1560/// elements of a 256-bit vector of [4 x i64].
1561///
1562/// \code{.operation}
1563/// result[63:0] := ZeroExtend(__V[15:0])
1564/// result[127:64] := ZeroExtend(__V[31:16])
1565/// result[191:128] := ZeroExtend(__V[47:32])
1566/// result[255:192] := ZeroExtend(__V[64:48])
1567/// \endcode
1568///
1569/// \headerfile <immintrin.h>
1570///
1571/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1572///
1573/// \param __V
1574/// A 128-bit vector of [8 x i16] containing the source values.
1575/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1576/// values.
1577static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1579 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1580}
1581
1582/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1583/// \a __V and returns the 64-bit values in the corresponding elements of a
1584/// 256-bit vector of [4 x i64].
1585///
1586/// \code{.operation}
1587/// result[63:0] := ZeroExtend(__V[31:0])
1588/// result[127:64] := ZeroExtend(__V[63:32])
1589/// result[191:128] := ZeroExtend(__V[95:64])
1590/// result[255:192] := ZeroExtend(__V[127:96])
1591/// \endcode
1592///
1593/// \headerfile <immintrin.h>
1594///
1595/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1596///
1597/// \param __V
1598/// A 128-bit vector of [4 x i32] containing the source values.
1599/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1600/// values.
1601static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1603 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1604}
1605
1606/// Multiplies signed 32-bit integers from even-numbered elements of two
1607/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1608/// [4 x i64] result.
1609///
1610/// \code{.operation}
1611/// result[63:0] := __a[31:0] * __b[31:0]
1612/// result[127:64] := __a[95:64] * __b[95:64]
1613/// result[191:128] := __a[159:128] * __b[159:128]
1614/// result[255:192] := __a[223:192] * __b[223:192]
1615/// \endcode
1616///
1617/// \headerfile <immintrin.h>
1618///
1619/// This intrinsic corresponds to the \c VPMULDQ instruction.
1620///
1621/// \param __a
1622/// A 256-bit vector of [8 x i32] containing one of the source operands.
1623/// \param __b
1624/// A 256-bit vector of [8 x i32] containing one of the source operands.
1625/// \returns A 256-bit vector of [4 x i64] containing the products.
1626static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1627_mm256_mul_epi32(__m256i __a, __m256i __b) {
1628 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1629}
1630
1631/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1632/// [16 x i16], truncates the 32-bit results to the most significant 18
1633/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1634/// product in the [16 x i16] result.
1635///
1636/// \code{.operation}
1637/// FOR i := 0 TO 15
1638/// j := i*16
1639/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1640/// result[j+15:j] := temp[16:1]
/// ENDFOR
1641/// \endcode
1642///
1643/// \headerfile <immintrin.h>
1644///
1645/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1646///
1647/// \param __a
1648/// A 256-bit vector of [16 x i16] containing one of the source operands.
1649/// \param __b
1650/// A 256-bit vector of [16 x i16] containing one of the source operands.
1651/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1652static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1653_mm256_mulhrs_epi16(__m256i __a, __m256i __b) {
1654 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1655}
1656
1657/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1658/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1659/// [16 x i16] result.
1660///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   temp := __a[j+15:j] * __b[j+15:j]
///   result[j+15:j] := temp[31:16]
/// ENDFOR
/// \endcode
///
1661/// \headerfile <immintrin.h>
1662///
1663/// This intrinsic corresponds to the \c VPMULHUW instruction.
1664///
1665/// \param __a
1666/// A 256-bit vector of [16 x i16] containing one of the source operands.
1667/// \param __b
1668/// A 256-bit vector of [16 x i16] containing one of the source operands.
1669/// \returns A 256-bit vector of [16 x i16] containing the products.
1670static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1671_mm256_mulhi_epu16(__m256i __a, __m256i __b) {
1672 return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
1673}
1674
1675/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1677/// [16 x i16] result.
1678///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   temp := __a[j+15:j] * __b[j+15:j]
///   result[j+15:j] := temp[31:16]
/// ENDFOR
/// \endcode
///
1679/// \headerfile <immintrin.h>
1680///
1681/// This intrinsic corresponds to the \c VPMULHW instruction.
1682///
1683/// \param __a
1684/// A 256-bit vector of [16 x i16] containing one of the source operands.
1685/// \param __b
1686/// A 256-bit vector of [16 x i16] containing one of the source operands.
1687/// \returns A 256-bit vector of [16 x i16] containing the products.
1688static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1689_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1690{
1691 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1692}
1693
1694/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1695/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1696/// [16 x i16] result.
1697///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   temp := __a[j+15:j] * __b[j+15:j]
///   result[j+15:j] := temp[15:0]
/// ENDFOR
/// \endcode
///
1698/// \headerfile <immintrin.h>
1699///
1700/// This intrinsic corresponds to the \c VPMULLW instruction.
1701///
1702/// \param __a
1703/// A 256-bit vector of [16 x i16] containing one of the source operands.
1704/// \param __b
1705/// A 256-bit vector of [16 x i16] containing one of the source operands.
1706/// \returns A 256-bit vector of [16 x i16] containing the products.
1707static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1708_mm256_mullo_epi16(__m256i __a, __m256i __b)
1709{
  /* The multiply is written as a plain vector expression on the __v16hu view
     (presumably the unsigned [16 x i16] type; its typedef is not in this
     file). The low 16 bits of a product are the same for signed and unsigned
     operands, so the documented result is unaffected. */
1710 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1711}
1712
1713/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1714/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1715/// [8 x i32] result.
1716///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   temp := __a[j+31:j] * __b[j+31:j]
///   result[j+31:j] := temp[31:0]
/// ENDFOR
/// \endcode
///
1717/// \headerfile <immintrin.h>
1718///
1719/// This intrinsic corresponds to the \c VPMULLD instruction.
1720///
1721/// \param __a
1722/// A 256-bit vector of [8 x i32] containing one of the source operands.
1723/// \param __b
1724/// A 256-bit vector of [8 x i32] containing one of the source operands.
1725/// \returns A 256-bit vector of [8 x i32] containing the products.
1726static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1727_mm256_mullo_epi32(__m256i __a, __m256i __b) {
  /* The multiply is written as a plain vector expression on the __v8su view
     (presumably the unsigned [8 x i32] type; its typedef is not in this
     file). The low 32 bits of a product are the same for signed and unsigned
     operands, so the documented result is unaffected. */
1728 return (__m256i)((__v8su)__a * (__v8su)__b);
1729}
1730
1731/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1732/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1733/// [4 x i64] result.
1734///
1735/// \code{.operation}
1736/// result[63:0] := __a[31:0] * __b[31:0]
1737/// result[127:64] := __a[95:64] * __b[95:64]
1738/// result[191:128] := __a[159:128] * __b[159:128]
1739/// result[255:192] := __a[223:192] * __b[223:192]
1740/// \endcode
1741///
1742/// \headerfile <immintrin.h>
1743///
1744/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1745///
1746/// \param __a
1747/// A 256-bit vector of [8 x i32] containing one of the source operands.
///    Only elements 0, 2, 4 and 6 are read.
1748/// \param __b
1749/// A 256-bit vector of [8 x i32] containing one of the source operands.
///    Only elements 0, 2, 4 and 6 are read.
1750/// \returns A 256-bit vector of [4 x i64] containing the products.
1751static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1752_mm256_mul_epu32(__m256i __a, __m256i __b) {
1753 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1754}
1755
1756/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1757/// \a __b.
1758///
/// \code{.operation}
/// result[255:0] := __a[255:0] | __b[255:0]
/// \endcode
///
1759/// \headerfile <immintrin.h>
1760///
1761/// This intrinsic corresponds to the \c VPOR instruction.
1762///
1763/// \param __a
1764/// A 256-bit integer vector.
1765/// \param __b
1766/// A 256-bit integer vector.
1767/// \returns A 256-bit integer vector containing the result.
1768static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1769_mm256_or_si256(__m256i __a, __m256i __b)
1770{
1771 return (__m256i)((__v4du)__a | (__v4du)__b);
1772}
1773
1774/// Computes four sum of absolute difference (SAD) operations on sets of eight
1775/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1776/// \a __b.
1777///
1778/// One SAD result is computed for each set of eight bytes from \a __a and
1779/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1780/// corresponding 64-bit element of the result.
1781///
1782/// A single SAD operation takes the differences between the corresponding
1783/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1784/// and sums these eight values to form one 16-bit result. This operation
1785/// is repeated four times with successive sets of eight bytes.
1786///
1787/// \code{.operation}
1788/// FOR i := 0 TO 3
1789/// j := i*64
1790/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1791/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1792/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1793/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1794/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1795/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1796/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1797/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1798/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1799/// temp4 + temp5 + temp6 + temp7
1800/// result[j+63:j+16] := 0
1801/// ENDFOR
1802/// \endcode
1803///
1804/// \headerfile <immintrin.h>
1805///
1806/// This intrinsic corresponds to the \c VPSADBW instruction.
1807///
1808/// \param __a
1809/// A 256-bit integer vector.
1810/// \param __b
1811/// A 256-bit integer vector.
1812/// \returns A 256-bit integer vector containing the result.
1813static __inline__ __m256i __DEFAULT_FN_ATTRS256
1814_mm256_sad_epu8(__m256i __a, __m256i __b)
1815{
  /* Note: this function is declared with __DEFAULT_FN_ATTRS256 rather than
     the _CONSTEXPR variant used by the neighbouring intrinsics, so it is not
     marked constexpr when compiled as C++11 or later. */
1816 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1817}
1818
1819/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1820/// to control information in the 256-bit integer vector \a __b, and
1821/// returns the 256-bit result. In effect there are two separate 128-bit
1822/// shuffles in the lower and upper halves.
1823///
1824/// \code{.operation}
1825/// FOR i := 0 TO 31
1826/// j := i*8
1827/// IF __b[j+7] == 1
1828/// result[j+7:j] := 0
1829/// ELSE
1830/// k := __b[j+3:j] * 8
1831/// IF i > 15
1832/// k := k + 128
1833/// FI
1834/// result[j+7:j] := __a[k+7:k]
1835/// FI
1836/// ENDFOR
1837/// \endcode
1838///
1839/// \headerfile <immintrin.h>
1840///
1841/// This intrinsic corresponds to the \c VPSHUFB instruction.
1842///
1843/// \param __a
1844/// A 256-bit integer vector containing source values.
1845/// \param __b
1846/// A 256-bit integer vector containing control information to determine
1847/// what goes into the corresponding byte of the result. If bit 7 of the
1848/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1849/// control byte specify the index (within the same 128-bit half) of \a __a
1850/// to copy to the result byte.
///    Bits 6:4 of the control byte are not used.
1851/// \returns A 256-bit integer vector containing the result.
1852static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1853_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
1854 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1855}
1856
1857/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1858/// according to control information in the integer literal \a imm, and
1859/// returns the 256-bit result. In effect there are two parallel 128-bit
1860/// shuffles in the lower and upper halves.
1861///
1862/// \code{.operation}
1863/// FOR i := 0 TO 3
1864/// j := i*32
1865/// k := (imm >> i*2)[1:0] * 32
1866/// result[j+31:j] := a[k+31:k]
1867/// result[128+j+31:128+j] := a[128+k+31:128+k]
1868/// ENDFOR
1869/// \endcode
1870///
1871/// \headerfile <immintrin.h>
1872///
1873/// \code
1874/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1875/// \endcode
1876///
1877/// This intrinsic corresponds to the \c VPSHUFD instruction.
1878///
1879/// \param a
1880/// A 256-bit vector of [8 x i32] containing source values.
1881/// \param imm
1882/// An immediate 8-bit value specifying which elements to copy from \a a.
1883/// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1884/// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1885/// forth.
1886/// \returns A 256-bit vector of [8 x i32] containing the result.
1887#define _mm256_shuffle_epi32(a, imm) \
1888 ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1889
1890/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1891/// according to control information in the integer literal \a imm, and
1892/// returns the 256-bit result. The upper 64 bits of each 128-bit half
1893/// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1894/// copied from \a a unchanged.
1895///
1896/// \code{.operation}
1897/// result[63:0] := a[63:0]
1898/// result[191:128] := a[191:128]
1899/// FOR i := 0 TO 3
1900/// j := i * 16 + 64
1901/// k := (imm >> i*2)[1:0] * 16 + 64
1902/// result[j+15:j] := a[k+15:k]
1903/// result[128+j+15:128+j] := a[128+k+15:128+k]
1904/// ENDFOR
1905/// \endcode
1906///
1907/// \headerfile <immintrin.h>
1908///
1909/// \code
1910/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1911/// \endcode
1912///
1913/// This intrinsic corresponds to the \c VPSHUFHW instruction.
1914///
1915/// \param a
1916/// A 256-bit vector of [16 x i16] containing source values.
1917/// \param imm
1918/// An immediate 8-bit value specifying which elements to copy from \a a.
1919/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1920/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1921/// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1922/// \returns A 256-bit vector of [16 x i16] containing the result.
1923#define _mm256_shufflehi_epi16(a, imm) \
1924 ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1925
1926/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1927/// according to control information in the integer literal \a imm, and
1928/// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1929/// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1930/// copied from \a a unchanged.
1931///
1932/// \code{.operation}
1933/// result[127:64] := a[127:64]
1934/// result[255:192] := a[255:192]
1935/// FOR i := 0 TO 3
1936/// j := i * 16
1937/// k := (imm >> i*2)[1:0] * 16
1938/// result[j+15:j] := a[k+15:k]
1939/// result[128+j+15:128+j] := a[128+k+15:128+k]
1940/// ENDFOR
1941/// \endcode
1942///
1943/// \headerfile <immintrin.h>
1944///
1945/// \code
1946/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1947/// \endcode
1948///
1949/// This intrinsic corresponds to the \c VPSHUFLW instruction.
1950///
1951/// \param a
1952/// A 256-bit vector of [16 x i16] to use as a source of data for the
1953/// result.
1954/// \param imm
1955/// An immediate 8-bit value specifying which elements to copy from \a a.
1956/// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1957/// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1958/// forth.
///    Each index is relative to the start of the 128-bit half being written.
1959/// \returns A 256-bit vector of [16 x i16] containing the result.
1960#define _mm256_shufflelo_epi16(a, imm) \
1961 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
1962
1963/// Sets each byte of the result to the corresponding byte of the 256-bit
1964/// integer vector in \a __a, the negative of that byte, or zero, depending
1965/// on whether the corresponding byte of the 256-bit integer vector in
1966/// \a __b is greater than zero, less than zero, or equal to zero,
1967/// respectively.
1968///
1969/// \headerfile <immintrin.h>
1970///
1971/// This intrinsic corresponds to the \c VPSIGNB instruction.
1972///
1973/// \param __a
1974/// A 256-bit integer vector containing the source bytes.
1975/// \param __b
1976/// A 256-bit integer vector whose bytes select the operation.
1977/// \returns A 256-bit integer vector containing the result.
1978static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1979_mm256_sign_epi8(__m256i __a, __m256i __b) {
1980 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
1981}
1982
1983/// Sets each element of the result to the corresponding element of the
1984/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
1985/// or zero, depending on whether the corresponding element of the 256-bit
1986/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
1987/// equal to zero, respectively.
1988///
1989/// \headerfile <immintrin.h>
1990///
1991/// This intrinsic corresponds to the \c VPSIGNW instruction.
1992///
1993/// \param __a
1994/// A 256-bit vector of [16 x i16] containing the source elements.
1995/// \param __b
1996/// A 256-bit vector of [16 x i16] whose elements select the operation.
1997/// \returns A 256-bit vector of [16 x i16] containing the result.
1998static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1999_mm256_sign_epi16(__m256i __a, __m256i __b) {
2000 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2001}
2002
2003/// Sets each element of the result to the corresponding element of the
2004/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2005/// zero, depending on whether the corresponding element of the 256-bit
2006/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2007/// equal to zero, respectively.
2008///
2009/// \headerfile <immintrin.h>
2010///
2011/// This intrinsic corresponds to the \c VPSIGND instruction.
2012///
2013/// \param __a
2014/// A 256-bit vector of [8 x i32] containing the source elements.
2015/// \param __b
2016/// A 256-bit vector of [8 x i32] whose elements select the operation.
2017/// \returns A 256-bit vector of [8 x i32] containing the result.
2018static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2019_mm256_sign_epi32(__m256i __a, __m256i __b) {
2020 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2021}
2022
2023/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2024/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2025/// is greater than 15, the returned result is all zeroes.
///
/// This macro has the same expansion as \c _mm256_bslli_epi128.
2026///
2027/// \headerfile <immintrin.h>
2028///
2029/// \code
2030/// __m256i _mm256_slli_si256(__m256i a, const int imm);
2031/// \endcode
2032///
2033/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2034///
2035/// \param a
2036/// A 256-bit integer vector to be shifted.
2037/// \param imm
2038/// An unsigned immediate value specifying the shift count (in bytes).
2039/// \returns A 256-bit integer vector containing the result.
2040#define _mm256_slli_si256(a, imm) \
2041 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2042 (int)(imm)))
2043
2044/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2045/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2046/// is greater than 15, the returned result is all zeroes.
///
/// This macro has the same expansion as \c _mm256_slli_si256.
2047///
2048/// \headerfile <immintrin.h>
2049///
2050/// \code
2051/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2052/// \endcode
2053///
2054/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2055///
2056/// \param a
2057/// A 256-bit integer vector to be shifted.
2058/// \param imm
2059/// An unsigned immediate value specifying the shift count (in bytes).
2060/// \returns A 256-bit integer vector containing the result.
2061#define _mm256_bslli_epi128(a, imm) \
2062 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2063 (int)(imm)))
2064
2065/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2066/// left by \a __count bits, shifting in zero bits, and returns the result.
2067/// If \a __count is greater than 15, the returned result is all zeroes.
2068///
2069/// \headerfile <immintrin.h>
2070///
2071/// This intrinsic corresponds to the \c VPSLLW instruction.
2072///
2073/// \param __a
2074/// A 256-bit vector of [16 x i16] to be shifted.
2075/// \param __count
2076/// An unsigned integer value specifying the shift count (in bits).
2077/// \returns A 256-bit vector of [16 x i16] containing the result.
2078static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2079_mm256_slli_epi16(__m256i __a, int __count) {
  /* No range check is done here: the int __count is passed to the builtin
     unchanged. */
2080 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2081}
2082
2083/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2084/// left by the number of bits specified by the lower 64 bits of \a __count,
2085/// shifting in zero bits, and returns the result. If that count is greater
2086/// than 15, the returned result is all zeroes.
2087///
2088/// \headerfile <immintrin.h>
2089///
2090/// This intrinsic corresponds to the \c VPSLLW instruction.
2091///
2092/// \param __a
2093/// A 256-bit vector of [16 x i16] to be shifted.
2094/// \param __count
2095/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2096/// shift count (in bits). The upper element is ignored.
2097/// \returns A 256-bit vector of [16 x i16] containing the result.
2098static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2099_mm256_sll_epi16(__m256i __a, __m128i __count) {
2100 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2101}
2102
2103/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2104/// left by \a __count bits, shifting in zero bits, and returns the result.
2105/// If \a __count is greater than 31, the returned result is all zeroes.
2106///
2107/// \headerfile <immintrin.h>
2108///
2109/// This intrinsic corresponds to the \c VPSLLD instruction.
2110///
2111/// \param __a
2112/// A 256-bit vector of [8 x i32] to be shifted.
2113/// \param __count
2114/// An unsigned integer value specifying the shift count (in bits).
2115/// \returns A 256-bit vector of [8 x i32] containing the result.
2116static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2117_mm256_slli_epi32(__m256i __a, int __count) {
  /* No range check is done here: the int __count is passed to the builtin
     unchanged. */
2118 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2119}
2120
2121/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2122/// left by the number of bits given in the lower 64 bits of \a __count,
2123/// shifting in zero bits, and returns the result. If that count is greater
2124/// than 31, the returned result is all zeroes.
2125///
2126/// \headerfile <immintrin.h>
2127///
2128/// This intrinsic corresponds to the \c VPSLLD instruction.
2129///
2130/// \param __a
2131/// A 256-bit vector of [8 x i32] to be shifted.
2132/// \param __count
2133/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2134/// shift count (in bits). The upper element is ignored.
2135/// \returns A 256-bit vector of [8 x i32] containing the result.
2136static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2137_mm256_sll_epi32(__m256i __a, __m128i __count) {
2138 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2139}
2140
2141/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2142/// left by \a __count bits, shifting in zero bits, and returns the result.
2143/// If \a __count is greater than 63, the returned result is all zeroes.
2144///
2145/// \headerfile <immintrin.h>
2146///
2147/// This intrinsic corresponds to the \c VPSLLQ instruction.
2148///
2149/// \param __a
2150/// A 256-bit vector of [4 x i64] to be shifted.
2151/// \param __count
2152/// An unsigned integer value specifying the shift count (in bits).
2153/// \returns A 256-bit vector of [4 x i64] containing the result.
2154static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2155_mm256_slli_epi64(__m256i __a, int __count) {
  /* No range check is done here: the int __count is passed to the builtin
     unchanged. Unlike the 16- and 32-bit variants, the builtin's result is
     returned without an explicit (__m256i) cast. */
2156 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2157}
2158
2159/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2160/// left by the number of bits given in the lower 64 bits of \a __count,
2161/// shifting in zero bits, and returns the result. If that count is greater
2162/// than 63, the returned result is all zeroes.
2163///
2164/// \headerfile <immintrin.h>
2165///
2166/// This intrinsic corresponds to the \c VPSLLQ instruction.
2167///
2168/// \param __a
2169/// A 256-bit vector of [4 x i64] to be shifted.
2170/// \param __count
2171/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2172/// shift count (in bits). The upper element is ignored.
2173/// \returns A 256-bit vector of [4 x i64] containing the result.
2174static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2175_mm256_sll_epi64(__m256i __a, __m128i __count) {
2176 return __builtin_ia32_psllq256((__v4di)__a, __count);
2177}
2178
2179/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2180/// right by \a __count bits, shifting in sign bits, and returns the result.
2181/// If \a __count is greater than 15, each element of the result is either
2182/// 0 or -1 according to the corresponding input sign bit.
2183///
2184/// \headerfile <immintrin.h>
2185///
2186/// This intrinsic corresponds to the \c VPSRAW instruction.
2187///
2188/// \param __a
2189/// A 256-bit vector of [16 x i16] to be shifted.
2190/// \param __count
2191/// An unsigned integer value specifying the shift count (in bits).
2192/// \returns A 256-bit vector of [16 x i16] containing the result.
2193static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2194_mm256_srai_epi16(__m256i __a, int __count) {
  /* No range check is done here: the int __count is passed to the builtin
     unchanged. */
2195 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2196}
2197
2198/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2199/// right by the number of bits given in the lower 64 bits of \a __count,
2200/// shifting in sign bits, and returns the result. If that count is greater
2201/// than 15, each element of the result is either 0 or -1 according to the
2202/// corresponding input sign bit.
2203///
2204/// \headerfile <immintrin.h>
2205///
2206/// This intrinsic corresponds to the \c VPSRAW instruction.
2207///
2208/// \param __a
2209/// A 256-bit vector of [16 x i16] to be shifted.
2210/// \param __count
2211/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2212/// shift count (in bits). The upper element is ignored.
2213/// \returns A 256-bit vector of [16 x i16] containing the result.
2214static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2215_mm256_sra_epi16(__m256i __a, __m128i __count) {
2216 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2217}
2218
2219/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2220/// right by \a __count bits, shifting in sign bits, and returns the result.
2221/// If \a __count is greater than 31, each element of the result is either
2222/// 0 or -1 according to the corresponding input sign bit.
2223///
2224/// \headerfile <immintrin.h>
2225///
2226/// This intrinsic corresponds to the \c VPSRAD instruction.
2227///
2228/// \param __a
2229/// A 256-bit vector of [8 x i32] to be shifted.
2230/// \param __count
2231/// An unsigned integer value specifying the shift count (in bits).
2232/// \returns A 256-bit vector of [8 x i32] containing the result.
2233static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2234_mm256_srai_epi32(__m256i __a, int __count) {
  /* No range check is done here: the int __count is passed to the builtin
     unchanged. */
2235 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2236}
2237
2238/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2239/// right by the number of bits given in the lower 64 bits of \a __count,
2240/// shifting in sign bits, and returns the result. If that count is greater
2241/// than 31, each element of the result is either 0 or -1 according to the
2242/// corresponding input sign bit.
2243///
2244/// \headerfile <immintrin.h>
2245///
2246/// This intrinsic corresponds to the \c VPSRAD instruction.
2247///
2248/// \param __a
2249/// A 256-bit vector of [8 x i32] to be shifted.
2250/// \param __count
2251/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2252/// shift count (in bits). The upper element is ignored.
2253/// \returns A 256-bit vector of [8 x i32] containing the result.
2254static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2255_mm256_sra_epi32(__m256i __a, __m128i __count) {
2256 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2257}
2258
2259/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2260/// \a imm bytes, shifting in zero bytes, and returns the result. If
2261/// \a imm is greater than 15, the returned result is all zeroes.
///
/// This macro has the same expansion as \c _mm256_bsrli_epi128.
2262///
2263/// \headerfile <immintrin.h>
2264///
2265/// \code
2266/// __m256i _mm256_srli_si256(__m256i a, const int imm);
2267/// \endcode
2268///
2269/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2270///
2271/// \param a
2272/// A 256-bit integer vector to be shifted.
2273/// \param imm
2274/// An unsigned immediate value specifying the shift count (in bytes).
2275/// \returns A 256-bit integer vector containing the result.
2276#define _mm256_srli_si256(a, imm) \
2277 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2278 (int)(imm)))
2279
2280/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2281/// \a imm bytes, shifting in zero bytes, and returns the result. If
2282/// \a imm is greater than 15, the returned result is all zeroes.
///
/// This macro has the same expansion as \c _mm256_srli_si256.
2283///
2284/// \headerfile <immintrin.h>
2285///
2286/// \code
2287/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2288/// \endcode
2289///
2290/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2291///
2292/// \param a
2293/// A 256-bit integer vector to be shifted.
2294/// \param imm
2295/// An unsigned immediate value specifying the shift count (in bytes).
2296/// \returns A 256-bit integer vector containing the result.
2297#define _mm256_bsrli_epi128(a, imm) \
2298 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2299 (int)(imm)))
2300
2301/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2302/// right by \a __count bits, shifting in zero bits, and returns the result.
2303/// If \a __count is greater than 15, the returned result is all zeroes.
2304///
2305/// \headerfile <immintrin.h>
2306///
2307/// This intrinsic corresponds to the \c VPSRLW instruction.
2308///
2309/// \param __a
2310/// A 256-bit vector of [16 x i16] to be shifted.
2311/// \param __count
2312/// An unsigned integer value specifying the shift count (in bits).
2313/// \returns A 256-bit vector of [16 x i16] containing the result.
2314static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2315_mm256_srli_epi16(__m256i __a, int __count) {
  /* No range check is done here: the int __count is passed to the builtin
     unchanged. */
2316 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2317}
2318
2319/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2320/// right by the number of bits given in the lower 64 bits of \a __count,
2321/// shifting in zero bits, and returns the result. If that count is greater
2322/// than 15, the returned result is all zeroes.
2323///
2324/// \headerfile <immintrin.h>
2325///
2326/// This intrinsic corresponds to the \c VPSRLW instruction.
2327///
2328/// \param __a
2329/// A 256-bit vector of [16 x i16] to be shifted.
2330/// \param __count
2331/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2332/// shift count (in bits). The upper element is ignored.
2333/// \returns A 256-bit vector of [16 x i16] containing the result.
2334static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2335_mm256_srl_epi16(__m256i __a, __m128i __count) {
2336 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2337}
2338
2339/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2340/// right by \a __count bits, shifting in zero bits, and returns the result.
2341/// If \a __count is greater than 31, the returned result is all zeroes.
2342///
2343/// \headerfile <immintrin.h>
2344///
2345/// This intrinsic corresponds to the \c VPSRLD instruction.
2346///
2347/// \param __a
2348/// A 256-bit vector of [8 x i32] to be shifted.
2349/// \param __count
2350/// An unsigned integer value specifying the shift count (in bits).
2351/// \returns A 256-bit vector of [8 x i32] containing the result.
2352static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2353_mm256_srli_epi32(__m256i __a, int __count) {
  /* No range check is done here: the int __count is passed to the builtin
     unchanged. */
2354 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2355}
2356
2357/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2358/// right by the number of bits given in the lower 64 bits of \a __count,
2359/// shifting in zero bits, and returns the result. If that count is greater
2360/// than 31, the returned result is all zeroes.
2361///
2362/// \headerfile <immintrin.h>
2363///
2364/// This intrinsic corresponds to the \c VPSRLD instruction.
2365///
2366/// \param __a
2367/// A 256-bit vector of [8 x i32] to be shifted.
2368/// \param __count
2369/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2370/// shift count (in bits). The upper element is ignored.
2371/// \returns A 256-bit vector of [8 x i32] containing the result.
2372static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2373_mm256_srl_epi32(__m256i __a, __m128i __count) {
2374 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2375}
2376
2377/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2378/// right by \a __count bits, shifting in zero bits, and returns the result.
2379/// If \a __count is greater than 63, the returned result is all zeroes.
2380///
2381/// \headerfile <immintrin.h>
2382///
2383/// This intrinsic corresponds to the \c VPSRLQ instruction.
2384///
2385/// \param __a
2386/// A 256-bit vector of [4 x i64] to be shifted.
2387/// \param __count
2388/// An unsigned integer value specifying the shift count (in bits).
2389/// \returns A 256-bit vector of [4 x i64] containing the result.
2390static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2391_mm256_srli_epi64(__m256i __a, int __count) {
  /* No range check is done here: the int __count is passed to the builtin
     unchanged. Unlike the 16- and 32-bit variants, the builtin's result is
     returned without an explicit (__m256i) cast. */
2392 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2393}
2394
2395/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2396/// right by the number of bits given in the lower 64 bits of \a __count,
2397/// shifting in zero bits, and returns the result. If that count is greater
2398/// than 63, the returned result is all zeroes.
2399///
2400/// \headerfile <immintrin.h>
2401///
2402/// This intrinsic corresponds to the \c VPSRLQ instruction.
2403///
2404/// \param __a
2405/// A 256-bit vector of [4 x i64] to be shifted.
2406/// \param __count
2407/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2408/// shift count (in bits). The upper element is ignored.
2409/// \returns A 256-bit vector of [4 x i64] containing the result.
2410static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2411_mm256_srl_epi64(__m256i __a, __m128i __count) {
2412 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2413}
2414
2415/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2416/// vectors. Returns the lower 8 bits of each difference in the
2417/// corresponding byte of the 256-bit integer vector result (overflow is
2418/// ignored).
2419///
2420/// \code{.operation}
2421/// FOR i := 0 TO 31
2422/// j := i*8
2423/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2424/// ENDFOR
2425/// \endcode
2426///
2427/// \headerfile <immintrin.h>
2428///
2429/// This intrinsic corresponds to the \c VPSUBB instruction.
2430///
2431/// \param __a
2432/// A 256-bit integer vector containing the minuends.
2433/// \param __b
2434/// A 256-bit integer vector containing the subtrahends.
2435/// \returns A 256-bit integer vector containing the differences.
2436static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2437_mm256_sub_epi8(__m256i __a, __m256i __b) {
  // Subtract lane-by-lane on the unsigned byte view (__v32qu): unsigned lanes
  // wrap on overflow, which gives the documented "overflow is ignored" result.
2438 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2439}
2440
2441/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2442/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2443/// the corresponding element of the [16 x i16] result (overflow is
2444/// ignored).
2445///
2446/// \code{.operation}
2447/// FOR i := 0 TO 15
2448/// j := i*16
2449/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2450/// ENDFOR
2451/// \endcode
2452///
2453/// \headerfile <immintrin.h>
2454///
2455/// This intrinsic corresponds to the \c VPSUBW instruction.
2456///
2457/// \param __a
2458/// A 256-bit vector of [16 x i16] containing the minuends.
2459/// \param __b
2460/// A 256-bit vector of [16 x i16] containing the subtrahends.
2461/// \returns A 256-bit vector of [16 x i16] containing the differences.
2462static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2463_mm256_sub_epi16(__m256i __a, __m256i __b) {
  // Uses the unsigned 16-bit lane view (__v16hu) so each difference wraps
  // modulo 2^16 instead of overflowing a signed lane.
2464 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2465}
2466
2467/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2468/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2469/// the corresponding element of the [8 x i32] result (overflow is ignored).
2470///
2471/// \code{.operation}
2472/// FOR i := 0 TO 7
2473/// j := i*32
2474/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2475/// ENDFOR
2476/// \endcode
2477///
2478/// \headerfile <immintrin.h>
2479///
2480/// This intrinsic corresponds to the \c VPSUBD instruction.
2481///
2482/// \param __a
2483/// A 256-bit vector of [8 x i32] containing the minuends.
2484/// \param __b
2485/// A 256-bit vector of [8 x i32] containing the subtrahends.
2486/// \returns A 256-bit vector of [8 x i32] containing the differences.
2487static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2488_mm256_sub_epi32(__m256i __a, __m256i __b) {
  // Uses the unsigned 32-bit lane view (__v8su) so each difference wraps
  // modulo 2^32 instead of overflowing a signed lane.
2489 return (__m256i)((__v8su)__a - (__v8su)__b);
2490}
2491
2492/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2493/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2494/// the corresponding element of the [4 x i64] result (overflow is ignored).
2495///
2496/// \code{.operation}
2497/// FOR i := 0 TO 3
2498/// j := i*64
2499/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2500/// ENDFOR
2501/// \endcode
2502///
2503/// \headerfile <immintrin.h>
2504///
2505/// This intrinsic corresponds to the \c VPSUBQ instruction.
2506///
2507/// \param __a
2508/// A 256-bit vector of [4 x i64] containing the minuends.
2509/// \param __b
2510/// A 256-bit vector of [4 x i64] containing the subtrahends.
2511/// \returns A 256-bit vector of [4 x i64] containing the differences.
2512static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2513_mm256_sub_epi64(__m256i __a, __m256i __b) {
  // Uses the unsigned 64-bit lane view (__v4du) so each difference wraps
  // modulo 2^64 instead of overflowing a signed lane.
2514 return (__m256i)((__v4du)__a - (__v4du)__b);
2515}
2516
2517/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2518/// vectors using signed saturation, and returns each difference in the
2519/// corresponding byte of the 256-bit integer vector result.
2520///
2521/// \code{.operation}
2522/// FOR i := 0 TO 31
2523/// j := i*8
2524/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2525/// ENDFOR
2526/// \endcode
2527///
2528/// \headerfile <immintrin.h>
2529///
2530/// This intrinsic corresponds to the \c VPSUBSB instruction.
2531///
2532/// \param __a
2533/// A 256-bit integer vector containing the minuends.
2534/// \param __b
2535/// A 256-bit integer vector containing the subtrahends.
2536/// \returns A 256-bit integer vector containing the differences.
2537static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2538_mm256_subs_epi8(__m256i __a, __m256i __b) {
  // The element-wise saturating builtin is type-generic; casting both inputs
  // to the signed byte view (__v32qs) is what selects signed saturation.
2539 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2540}
2541
2542/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2543/// vectors of [16 x i16] using signed saturation, and returns each
2544/// difference in the corresponding element of the [16 x i16] result.
2545///
2546/// \code{.operation}
2547/// FOR i := 0 TO 15
2548/// j := i*16
2549/// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2550/// ENDFOR
2551/// \endcode
2552///
2553/// \headerfile <immintrin.h>
2554///
2555/// This intrinsic corresponds to the \c VPSUBSW instruction.
2556///
2557/// \param __a
2558/// A 256-bit vector of [16 x i16] containing the minuends.
2559/// \param __b
2560/// A 256-bit vector of [16 x i16] containing the subtrahends.
2561/// \returns A 256-bit vector of [16 x i16] containing the differences.
2562static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2563_mm256_subs_epi16(__m256i __a, __m256i __b) {
  // Casting to the signed 16-bit lane view (__v16hi) makes the type-generic
  // builtin saturate each difference to the signed 16-bit range.
2564 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2565}
2566
2567/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2568/// vectors using unsigned saturation, and returns each difference in the
2569/// corresponding byte of the 256-bit integer vector result. For each byte,
2570/// computes <c> result = __a - __b </c>.
2571///
2572/// \code{.operation}
2573/// FOR i := 0 TO 31
2574/// j := i*8
2575/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2576/// ENDFOR
2577/// \endcode
2578///
2579/// \headerfile <immintrin.h>
2580///
2581/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2582///
2583/// \param __a
2584/// A 256-bit integer vector containing the minuends.
2585/// \param __b
2586/// A 256-bit integer vector containing the subtrahends.
2587/// \returns A 256-bit integer vector containing the differences.
2588static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2589_mm256_subs_epu8(__m256i __a, __m256i __b) {
  // Same builtin as the signed variant; the unsigned byte view (__v32qu)
  // selects unsigned saturation.
2590 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2591}
2592
2593/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2594/// vectors of [16 x i16] using unsigned saturation, and returns each
2595/// difference in the corresponding element of the [16 x i16] result.
2596///
2597/// \code{.operation}
2598/// FOR i := 0 TO 15
2599/// j := i*16
2600/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2601/// ENDFOR
2602/// \endcode
2603///
2604/// \headerfile <immintrin.h>
2605///
2606/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2607///
2608/// \param __a
2609/// A 256-bit vector of [16 x i16] containing the minuends.
2610/// \param __b
2611/// A 256-bit vector of [16 x i16] containing the subtrahends.
2612/// \returns A 256-bit vector of [16 x i16] containing the differences.
2613static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2614_mm256_subs_epu16(__m256i __a, __m256i __b) {
  // Same builtin as the signed variant; the unsigned 16-bit lane view
  // (__v16hu) selects unsigned saturation.
2615 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2616}
2617
2618/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2619/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2620/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2621/// input; other bits in these parameters are ignored.
2622///
2623/// \code{.operation}
2624/// result[7:0] := __a[71:64]
2625/// result[15:8] := __b[71:64]
2626/// result[23:16] := __a[79:72]
2627/// result[31:24] := __b[79:72]
2628/// . . .
2629/// result[127:120] := __b[127:120]
2630/// result[135:128] := __a[199:192]
2631/// . . .
2632/// result[255:248] := __b[255:248]
2633/// \endcode
2634///
2635/// \headerfile <immintrin.h>
2636///
2637/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2638///
2639/// \param __a
2640/// A 256-bit integer vector used as the source for the even-numbered bytes
2641/// of the result.
2642/// \param __b
2643/// A 256-bit integer vector used as the source for the odd-numbered bytes
2644/// of the result.
2645/// \returns A 256-bit integer vector containing the result.
2646static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2647_mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
  // Shuffle indices address the concatenation of the two inputs: k selects
  // byte k of __a and 32+k selects byte k of __b. Bytes 8..15 and 24..31 are
  // the upper 64 bits of the low and high 128-bit halves, respectively.
2648 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2649}
2650
2651/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2652/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2653/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2654/// 128-bit half of \a __a and \a __b as input; other bits in these
2655/// parameters are ignored.
2656///
2657/// \code{.operation}
2658/// result[15:0] := __a[79:64]
2659/// result[31:16] := __b[79:64]
2660/// result[47:32] := __a[95:80]
2661/// result[63:48] := __b[95:80]
2662/// . . .
2663/// result[127:112] := __b[127:112]
2664/// result[143:128] := __a[207:192]
2665/// . . .
2666/// result[255:240] := __b[255:240]
2667/// \endcode
2668///
2669/// \headerfile <immintrin.h>
2670///
2671/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2672///
2673/// \param __a
2674/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2675/// elements of the result.
2676/// \param __b
2677/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2678/// elements of the result.
2679/// \returns A 256-bit vector of [16 x i16] containing the result.
2680static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2681_mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
  // Index k selects element k of __a, 16+k selects element k of __b.
  // Elements 4..7 and 12..15 are the upper 64 bits of each 128-bit half.
2682 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2683}
2684
2685/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2686/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2687/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2688/// of \a __a and \a __b as input; other bits in these parameters are
2689/// ignored.
2690///
2691/// \code{.operation}
2692/// result[31:0] := __a[95:64]
2693/// result[63:32] := __b[95:64]
2694/// result[95:64] := __a[127:96]
2695/// result[127:96] := __b[127:96]
2696/// result[159:128] := __a[223:192]
2697/// result[191:160] := __b[223:192]
2698/// result[223:192] := __a[255:224]
2699/// result[255:224] := __b[255:224]
2700/// \endcode
2701///
2702/// \headerfile <immintrin.h>
2703///
2704/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2705///
2706/// \param __a
2707/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2708/// elements of the result.
2709/// \param __b
2710/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2711/// elements of the result.
2712/// \returns A 256-bit vector of [8 x i32] containing the result.
2713static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2714_mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
  // Index k selects element k of __a, 8+k selects element k of __b.
  // Elements 2,3 and 6,7 are the upper 64 bits of each 128-bit half.
2715 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2716}
2717
2718/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2719/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2720/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2721/// of \a __a and \a __b as input; other bits in these parameters are
2722/// ignored.
2723///
2724/// \code{.operation}
2725/// result[63:0] := __a[127:64]
2726/// result[127:64] := __b[127:64]
2727/// result[191:128] := __a[255:192]
2728/// result[255:192] := __b[255:192]
2729/// \endcode
2730///
2731/// \headerfile <immintrin.h>
2732///
2733/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2734///
2735/// \param __a
2736/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2737/// elements of the result.
2738/// \param __b
2739/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2740/// elements of the result.
2741/// \returns A 256-bit vector of [4 x i64] containing the result.
2742static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2743_mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
  // Index k selects element k of __a, 4+k selects element k of __b.
  // Elements 1 and 3 are the upper 64 bits of each 128-bit half.
2744 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2745}
2746
2747/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2748/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2749/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2750/// input; other bits in these parameters are ignored.
2751///
2752/// \code{.operation}
2753/// result[7:0] := __a[7:0]
2754/// result[15:8] := __b[7:0]
2755/// result[23:16] := __a[15:8]
2756/// result[31:24] := __b[15:8]
2757/// . . .
2758/// result[127:120] := __b[63:56]
2759/// result[135:128] := __a[135:128]
2760/// . . .
2761/// result[255:248] := __b[191:184]
2762/// \endcode
2763///
2764/// \headerfile <immintrin.h>
2765///
2766/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2767///
2768/// \param __a
2769/// A 256-bit integer vector used as the source for the even-numbered bytes
2770/// of the result.
2771/// \param __b
2772/// A 256-bit integer vector used as the source for the odd-numbered bytes
2773/// of the result.
2774/// \returns A 256-bit integer vector containing the result.
2775static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2776_mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
  // Shuffle indices address the concatenation of the two inputs: k selects
  // byte k of __a and 32+k selects byte k of __b. Bytes 0..7 and 16..23 are
  // the lower 64 bits of the low and high 128-bit halves, respectively.
2777 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2778}
2779
2780/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2781/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2782/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2783/// 128-bit half of \a __a and \a __b as input; other bits in these
2784/// parameters are ignored.
2785///
2786/// \code{.operation}
2787/// result[15:0] := __a[15:0]
2788/// result[31:16] := __b[15:0]
2789/// result[47:32] := __a[31:16]
2790/// result[63:48] := __b[31:16]
2791/// . . .
2792/// result[127:112] := __b[63:48]
2793/// result[143:128] := __a[143:128]
2794/// . . .
2795/// result[255:240] := __b[191:176]
2796/// \endcode
2797///
2798/// \headerfile <immintrin.h>
2799///
2800/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2801///
2802/// \param __a
2803/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2804/// elements of the result.
2805/// \param __b
2806/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2807/// elements of the result.
2808/// \returns A 256-bit vector of [16 x i16] containing the result.
2809static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2810_mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
  // Index k selects element k of __a, 16+k selects element k of __b.
  // Elements 0..3 and 8..11 are the lower 64 bits of each 128-bit half.
2811 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2812}
2813
2814/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2815/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2816/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2817/// of \a __a and \a __b as input; other bits in these parameters are
2818/// ignored.
2819///
2820/// \code{.operation}
2821/// result[31:0] := __a[31:0]
2822/// result[63:32] := __b[31:0]
2823/// result[95:64] := __a[63:32]
2824/// result[127:96] := __b[63:32]
2825/// result[159:128] := __a[159:128]
2826/// result[191:160] := __b[159:128]
2827/// result[223:192] := __a[191:160]
2828/// result[255:224] := __b[191:160]
2829/// \endcode
2830///
2831/// \headerfile <immintrin.h>
2832///
2833/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2834///
2835/// \param __a
2836/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2837/// elements of the result.
2838/// \param __b
2839/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2840/// elements of the result.
2841/// \returns A 256-bit vector of [8 x i32] containing the result.
2842static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2843_mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
  // Index k selects element k of __a, 8+k selects element k of __b.
  // Elements 0,1 and 4,5 are the lower 64 bits of each 128-bit half.
2844 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2845}
2846
2847/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2848/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2849/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2850/// of \a __a and \a __b as input; other bits in these parameters are
2851/// ignored.
2852///
2853/// \code{.operation}
2854/// result[63:0] := __a[63:0]
2855/// result[127:64] := __b[63:0]
2856/// result[191:128] := __a[191:128]
2857/// result[255:192] := __b[191:128]
2858/// \endcode
2859///
2860/// \headerfile <immintrin.h>
2861///
2862/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2863///
2864/// \param __a
2865/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2866/// elements of the result.
2867/// \param __b
2868/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2869/// elements of the result.
2870/// \returns A 256-bit vector of [4 x i64] containing the result.
2871static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2872_mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
  // Index k selects element k of __a, 4+k selects element k of __b.
  // Elements 0 and 2 are the lower 64 bits of each 128-bit half.
2873 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2874}
2875
2876/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2877/// \a __b.
2878///
2879/// \headerfile <immintrin.h>
2880///
2881/// This intrinsic corresponds to the \c VPXOR instruction.
2882///
2883/// \param __a
2884/// A 256-bit integer vector.
2885/// \param __b
2886/// A 256-bit integer vector.
2887/// \returns A 256-bit integer vector containing the result.
2888static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2889_mm256_xor_si256(__m256i __a, __m256i __b)
2890{
  // XOR is purely bitwise, so the lane width chosen for the cast (4 x 64-bit
  // here) has no effect on the 256-bit result.
2891 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2892}
2893
2894/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2895/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2896/// boundary.
2897///
2898/// \headerfile <immintrin.h>
2899///
2900/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2901///
2902/// \param __V
2903/// A pointer to the 32-byte aligned memory containing the vector to load.
2904/// \returns A 256-bit integer vector loaded from memory.
2905static __inline__ __m256i __DEFAULT_FN_ATTRS256
// NOTE(review): the declarator line (function name and parameter list) appears
// to be missing from this listing at this point; confirm against upstream.
2907{
  // Local typedef attaches 32-byte alignment to __v4di so that the pointer
  // cast below carries that alignment into the non-temporal load.
2908 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2909 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2910}
2911
2912/// Broadcasts the 32-bit floating-point value from the low element of the
2913/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2914/// 128-bit vector of [4 x float].
2915///
2916/// \headerfile <immintrin.h>
2917///
2918/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2919///
2920/// \param __X
2921/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2922/// \returns A 128-bit vector of [4 x float] containing the result.
2923static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Every result lane uses shuffle index 0, i.e. the low float of __X.
2925 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
2926}
2927
2928/// Broadcasts the 64-bit floating-point value from the low element of the
2929/// 128-bit vector of [2 x double] in \a __a to both elements of the
2930/// result's 128-bit vector of [2 x double].
2931///
2932/// \headerfile <immintrin.h>
2933///
2934/// This intrinsic corresponds to the \c MOVDDUP instruction.
2935///
2936/// \param __a
2937/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2938/// \returns A 128-bit vector of [2 x double] containing the result.
2939static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Both result lanes use shuffle index 0, i.e. the low double of __a.
2941 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2942}
2943
2944/// Broadcasts the 32-bit floating-point value from the low element of the
2945/// 128-bit vector of [4 x float] in \a __X to all elements of the
2946/// result's 256-bit vector of [8 x float].
2947///
2948/// \headerfile <immintrin.h>
2949///
2950/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2951///
2952/// \param __X
2953/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2954/// \returns A 256-bit vector of [8 x float] containing the result.
2955static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Eight indices from a 4-lane input widen the result to 8 lanes; all of them
// are 0, i.e. the low float of __X.
2957 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2958}
2959
2960/// Broadcasts the 64-bit floating-point value from the low element of the
2961/// 128-bit vector of [2 x double] in \a __X to all elements of the
2962/// result's 256-bit vector of [4 x double].
2963///
2964/// \headerfile <immintrin.h>
2965///
2966/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2967///
2968/// \param __X
2969/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2970/// \returns A 256-bit vector of [4 x double] containing the result.
2971static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Four indices from a 2-lane input widen the result to 4 lanes; all of them
// are 0, i.e. the low double of __X.
2973 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
2974}
2975
2976/// Broadcasts the 128-bit integer data from \a __X to both the lower and
2977/// upper halves of the 256-bit result.
2978///
2979/// \headerfile <immintrin.h>
2980///
2981/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
2982///
2983/// \param __X
2984/// A 128-bit integer vector to be broadcast.
2985/// \returns A 256-bit integer vector containing the result.
2986static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Indices 0, 1, 0, 1 repeat the two 64-bit elements of __X, so the 128-bit
// input appears in both the lower and upper halves of the result.
2988 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
2989}
2990
/* Alternate spelling: forwards its argument unchanged to
   _mm256_broadcastsi128_si256. */
2991#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
2992
2993/// Merges 32-bit integer elements from either of the two 128-bit vectors of
2994/// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
2995/// as specified by the immediate integer operand \a M.
2996///
2997/// \code{.operation}
2998/// FOR i := 0 TO 3
2999/// j := i*32
3000/// IF M[i] == 0
3001/// result[31+j:j] := V1[31+j:j]
3002/// ELSE
3003/// result[31+j:j] := V2[31+j:j]
3004/// FI
3005/// ENDFOR
3006/// \endcode
3007///
3008/// \headerfile <immintrin.h>
3009///
3010/// \code
3011/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3012/// \endcode
3013///
3014/// This intrinsic corresponds to the \c VPBLENDD instruction.
3015///
3016/// \param V1
3017/// A 128-bit vector of [4 x i32] containing source values.
3018/// \param V2
3019/// A 128-bit vector of [4 x i32] containing source values.
3020/// \param M
3021/// An immediate 8-bit integer operand, with bits [3:0] specifying the
3022/// source for each element of the result. The position of the mask bit
3023/// corresponds to the index of a copied value. When a mask bit is 0, the
3024/// element is copied from \a V1; otherwise, it is copied from \a V2.
3025/// \returns A 128-bit vector of [4 x i32] containing the result.
3026#define _mm_blend_epi32(V1, V2, M) \
3027 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
3028 (__v4si)(__m128i)(V2), (int)(M)))
3029
3030/// Merges 32-bit integer elements from either of the two 256-bit vectors of
3031/// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3032/// as specified by the immediate integer operand \a M.
3033///
3034/// \code{.operation}
3035/// FOR i := 0 TO 7
3036/// j := i*32
3037/// IF M[i] == 0
3038/// result[31+j:j] := V1[31+j:j]
3039/// ELSE
3040/// result[31+j:j] := V2[31+j:j]
3041/// FI
3042/// ENDFOR
3043/// \endcode
3044///
3045/// \headerfile <immintrin.h>
3046///
3047/// \code
3048/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3049/// \endcode
3050///
3051/// This intrinsic corresponds to the \c VPBLENDD instruction.
3052///
3053/// \param V1
3054/// A 256-bit vector of [8 x i32] containing source values.
3055/// \param V2
3056/// A 256-bit vector of [8 x i32] containing source values.
3057/// \param M
3058/// An immediate 8-bit integer operand, with bits [7:0] specifying the
3059/// source for each element of the result. The position of the mask bit
3060/// corresponds to the index of a copied value. When a mask bit is 0, the
3061/// element is copied from \a V1; otherwise, it is copied from \a V2.
3062/// \returns A 256-bit vector of [8 x i32] containing the result.
3063#define _mm256_blend_epi32(V1, V2, M) \
3064 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
3065 (__v8si)(__m256i)(V2), (int)(M)))
3066
3067/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3068/// bytes of the 256-bit result.
3069///
3070/// \headerfile <immintrin.h>
3071///
3072/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3073///
3074/// \param __X
3075/// A 128-bit integer vector whose low byte will be broadcast.
3076/// \returns A 256-bit integer vector containing the result.
3077static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// 32 indices from a 16-byte input widen the result to 32 bytes; all of them
// are 0, i.e. the low byte of __X.
3079 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3080}
3081
3082/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3083/// to all elements of the result's 256-bit vector of [16 x i16].
3084///
3085/// \headerfile <immintrin.h>
3086///
3087/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3088///
3089/// \param __X
3090/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3091/// \returns A 256-bit vector of [16 x i16] containing the result.
3092static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// 16 indices from an 8-lane input widen the result to 16 lanes; all of them
// are 0, i.e. the low 16-bit element of __X.
3094 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3095}
3096
3097/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3098/// to all elements of the result's 256-bit vector of [8 x i32].
3099///
3100/// \headerfile <immintrin.h>
3101///
3102/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3103///
3104/// \param __X
3105/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3106/// \returns A 256-bit vector of [8 x i32] containing the result.
3107static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Eight indices from a 4-lane input widen the result to 8 lanes; all of them
// are 0, i.e. the low 32-bit element of __X.
3109 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3110}
3111
3112/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3113/// to all elements of the result's 256-bit vector of [4 x i64].
3114///
3115/// \headerfile <immintrin.h>
3116///
3117/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3118///
3119/// \param __X
3120/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3121/// \returns A 256-bit vector of [4 x i64] containing the result.
3122static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Four indices from a 2-lane input widen the result to 4 lanes; all of them
// are 0, i.e. the low 64-bit element of __X.
3124 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3125}
3126
3127/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3128/// bytes of the 128-bit result.
3129///
3130/// \headerfile <immintrin.h>
3131///
3132/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3133///
3134/// \param __X
3135/// A 128-bit integer vector whose low byte will be broadcast.
3136/// \returns A 128-bit integer vector containing the result.
3137static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// All 16 result bytes use shuffle index 0, i.e. the low byte of __X.
3139 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3140}
3141
3142/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3143/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3144///
3145/// \headerfile <immintrin.h>
3146///
3147/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3148///
3149/// \param __X
3150/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3151/// \returns A 128-bit vector of [8 x i16] containing the result.
3152static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// All eight result lanes use shuffle index 0, i.e. the low 16-bit element
// of __X.
3154 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3155}
3156
3157/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3158/// to all elements of the result's 128-bit vector of [4 x i32].
3159///
3160/// \headerfile <immintrin.h>
3161///
3162/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3163///
3164/// \param __X
3165/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3166/// \returns A 128-bit vector of [4 x i32] containing the result.
3167static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// All four result lanes use shuffle index 0, i.e. the low 32-bit element
// of __X.
3169 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3170}
3171
3172/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3173/// to both elements of the result's 128-bit vector of [2 x i64].
3174///
3175/// \headerfile <immintrin.h>
3176///
3177/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3178///
3179/// \param __X
3180/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3181/// \returns A 128-bit vector of [2 x i64] containing the result.
3182static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Both result lanes use shuffle index 0, i.e. the low 64-bit element of __X.
3184 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3185}
3186
3187/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3188/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3189/// elements of the 256-bit vector of [8 x i32] in \a __b.
3190///
3191/// \code{.operation}
3192/// FOR i := 0 TO 7
3193/// j := i*32
3194/// k := __b[j+2:j] * 32
3195/// result[j+31:j] := __a[k+31:k]
3196/// ENDFOR
3197/// \endcode
3198///
3199/// \headerfile <immintrin.h>
3200///
3201/// This intrinsic corresponds to the \c VPERMD instruction.
3202///
3203/// \param __a
3204/// A 256-bit vector of [8 x i32] containing the source values.
3205/// \param __b
3206/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3207/// \a __a.
3208/// \returns A 256-bit vector of [8 x i32] containing the result.
3209static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
// NOTE(review): the declarator line (function name, parameter list and opening
// brace) appears to be missing from this listing; confirm against upstream.
// Source values (__a) and indexes (__b) are both passed to the builtin as
// 8 x 32-bit integer lanes; the result is cast back to the generic __m256i.
3211 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3212}
3213
3214/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3215/// the 256-bit vector of [4 x double] in \a V as specified by the
3216/// immediate value \a M.
3217///
3218/// \code{.operation}
3219/// FOR i := 0 TO 3
3220/// j := i*64
3221/// k := (M >> i*2)[1:0] * 64
3222/// result[j+63:j] := V[k+63:k]
3223/// ENDFOR
3224/// \endcode
3225///
3226/// \headerfile <immintrin.h>
3227///
3228/// \code
3229/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3230/// \endcode
3231///
3232/// This intrinsic corresponds to the \c VPERMPD instruction.
3233///
3234/// \param V
3235/// A 256-bit vector of [4 x double] containing the source values.
3236/// \param M
3237/// An immediate 8-bit value specifying which elements to copy from \a V.
3238/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3239/// \a M[3:2] specifies the index for element 1, and so forth.
3240/// \returns A 256-bit vector of [4 x double] containing the result.
3241#define _mm256_permute4x64_pd(V, M) /* M must be an immediate; it is passed on as (int)(M) */ \
3242 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3243
3244/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3245/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3246/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3247///
3248/// \code{.operation}
3249/// FOR i := 0 TO 7
3250/// j := i*32
3251/// k := __b[j+2:j] * 32
3252/// result[j+31:j] := __a[k+31:k]
3253/// ENDFOR
3254/// \endcode
3255///
3256/// \headerfile <immintrin.h>
3257///
3258/// This intrinsic corresponds to the \c VPERMPS instruction.
3259///
3260/// \param __a
3261/// A 256-bit vector of [8 x float] containing the source values.
3262/// \param __b
3263/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3264/// \a __a.
3265/// \returns A 256-bit vector of [8 x float] containing the result.
3266static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
3268 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3269}
3270
3271/// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3272/// of the 256-bit vector of [4 x i64] in \a V as specified by the
3273/// immediate value \a M.
3274///
3275/// \code{.operation}
3276/// FOR i := 0 TO 3
3277/// j := i*64
3278/// k := (M >> i*2)[1:0] * 64
3279/// result[j+63:j] := V[k+63:k]
3280/// ENDFOR
3281/// \endcode
3282///
3283/// \headerfile <immintrin.h>
3284///
3285/// \code
3286/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3287/// \endcode
3288///
3289/// This intrinsic corresponds to the \c VPERMQ instruction.
3290///
3291/// \param V
3292/// A 256-bit vector of [4 x i64] containing the source values.
3293/// \param M
3294/// An immediate 8-bit value specifying which elements to copy from \a V.
3295/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3296/// \a M[3:2] specifies the index for element 1, and so forth.
3297/// \returns A 256-bit vector of [4 x i64] containing the result.
3298#define _mm256_permute4x64_epi64(V, M) /* M must be an immediate; it is passed on as (int)(M) */ \
3299 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3300
3301/// Sets each half of the 256-bit result either to zero or to one of the
3302/// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3303/// as specified by the immediate value \a M.
3304///
3305/// \code{.operation}
3306/// FOR i := 0 TO 1
3307/// j := i*128
3308/// k := M >> (i*4)
3309/// IF k[3] == 0
3310/// CASE (k[1:0]) OF
3311/// 0: result[127+j:j] := V1[127:0]
3312/// 1: result[127+j:j] := V1[255:128]
3313/// 2: result[127+j:j] := V2[127:0]
3314/// 3: result[127+j:j] := V2[255:128]
3315/// ESAC
3316/// ELSE
3317/// result[127+j:j] := 0
3318/// FI
3319/// ENDFOR
3320/// \endcode
3321///
3322/// \headerfile <immintrin.h>
3323///
3324/// \code
3325/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3326/// \endcode
3327///
3328/// This intrinsic corresponds to the \c VPERM2I128 instruction.
3329///
3330/// \param V1
3331/// A 256-bit integer vector containing source values.
3332/// \param V2
3333/// A 256-bit integer vector containing source values.
3334/// \param M
3335/// An immediate value specifying how to form the result. Bits [3:0]
3336/// control the lower half of the result, bits [7:4] control the upper half.
3337/// Within each 4-bit control value, if bit 3 is 1, the result is zero,
3338/// otherwise bits [1:0] determine the source as follows. \n
3339/// 0: the lower half of \a V1 \n
3340/// 1: the upper half of \a V1 \n
3341/// 2: the lower half of \a V2 \n
3342/// 3: the upper half of \a V2
3343/// \returns A 256-bit integer vector containing the result.
3344#define _mm256_permute2x128_si256(V1, V2, M) /* M must be an immediate; it is passed on as (int)(M) */ \
3345 ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3346
3347/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3348/// of the immediate \a M is zero, extracts the lower half of \a V;
3349/// otherwise, extracts the upper half.
3350///
3351/// \headerfile <immintrin.h>
3352///
3353/// \code
3354/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3355/// \endcode
3356///
3357/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3358///
3359/// \param V
3360/// A 256-bit integer vector containing the source values.
3361/// \param M
3362/// An immediate value specifying which half of \a V to extract.
3363/// \returns A 128-bit integer vector containing the result.
3364#define _mm256_extracti128_si256(V, M) /* M must be an immediate; it is passed on as (int)(M) */ \
3365 ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3366
3367/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3368/// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3369/// is zero, overwrites the lower half of the result; otherwise,
3370/// overwrites the upper half.
3371///
3372/// \headerfile <immintrin.h>
3373///
3374/// \code
3375/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3376/// \endcode
3377///
3378/// This intrinsic corresponds to the \c VINSERTI128 instruction.
3379///
3380/// \param V1
3381/// A 256-bit integer vector containing a source value.
3382/// \param V2
3383/// A 128-bit integer vector containing a source value.
3384/// \param M
3385/// An immediate value specifying where to put \a V2 in the result.
3386/// \returns A 256-bit integer vector containing the result.
3387#define _mm256_inserti128_si256(V1, V2, M) /* M must be an immediate; it is passed on as (int)(M) */ \
3388 ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
3389 (__v2di)(__m128i)(V2), (int)(M)))
3390
3391/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3392/// the most significant bit of the corresponding element in the mask
3393/// \a __M is set; otherwise, sets that element of the result to zero.
3394/// Returns the 256-bit [8 x i32] result.
3395///
3396/// \code{.operation}
3397/// FOR i := 0 TO 7
3398/// j := i*32
3399/// IF __M[j+31] == 1
3400/// result[j+31:j] := Load32(__X+(i*4))
3401/// ELSE
3402/// result[j+31:j] := 0
3403/// FI
3404/// ENDFOR
3405/// \endcode
3406///
3407/// \headerfile <immintrin.h>
3408///
3409/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3410///
3411/// \param __X
3412/// A pointer to the memory used for loading values.
3413/// \param __M
3414/// A 256-bit vector of [8 x i32] containing the mask bits.
3415/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3416/// elements.
3417static __inline__ __m256i __DEFAULT_FN_ATTRS256
3418_mm256_maskload_epi32(int const *__X, __m256i __M)
3419{ /* __X is cast to (const __v8si *) and __M to __v8si for the builtin. */
3420 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3421}
3422
3423/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3424/// the most significant bit of the corresponding element in the mask
3425/// \a __M is set; otherwise, sets that element of the result to zero.
3426/// Returns the 256-bit [4 x i64] result.
3427///
3428/// \code{.operation}
3429/// FOR i := 0 TO 3
3430/// j := i*64
3431/// IF __M[j+63] == 1
3432/// result[j+63:j] := Load64(__X+(i*8))
3433/// ELSE
3434/// result[j+63:j] := 0
3435/// FI
3436/// ENDFOR
3437/// \endcode
3438///
3439/// \headerfile <immintrin.h>
3440///
3441/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3442///
3443/// \param __X
3444/// A pointer to the memory used for loading values.
3445/// \param __M
3446/// A 256-bit vector of [4 x i64] containing the mask bits.
3447/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3448/// elements.
3449static __inline__ __m256i __DEFAULT_FN_ATTRS256
3450_mm256_maskload_epi64(long long const *__X, __m256i __M)
3451{ /* __X is cast to (const __v4di *) and __M to __v4di for the builtin. */
3452 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3453}
3454
3455/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3456/// the most significant bit of the corresponding element in the mask
3457/// \a __M is set; otherwise, sets that element of the result to zero.
3458/// Returns the 128-bit [4 x i32] result.
3459///
3460/// \code{.operation}
3461/// FOR i := 0 TO 3
3462/// j := i*32
3463/// IF __M[j+31] == 1
3464/// result[j+31:j] := Load32(__X+(i*4))
3465/// ELSE
3466/// result[j+31:j] := 0
3467/// FI
3468/// ENDFOR
3469/// \endcode
3470///
3471/// \headerfile <immintrin.h>
3472///
3473/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3474///
3475/// \param __X
3476/// A pointer to the memory used for loading values.
3477/// \param __M
3478/// A 128-bit vector of [4 x i32] containing the mask bits.
3479/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3480/// elements.
3481static __inline__ __m128i __DEFAULT_FN_ATTRS128
3482_mm_maskload_epi32(int const *__X, __m128i __M)
3483{ /* __X is cast to (const __v4si *) and __M to __v4si for the builtin. */
3484 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3485}
3486
3487/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3488/// the most significant bit of the corresponding element in the mask
3489/// \a __M is set; otherwise, sets that element of the result to zero.
3490/// Returns the 128-bit [2 x i64] result.
3491///
3492/// \code{.operation}
3493/// FOR i := 0 TO 1
3494/// j := i*64
3495/// IF __M[j+63] == 1
3496/// result[j+63:j] := Load64(__X+(i*8))
3497/// ELSE
3498/// result[j+63:j] := 0
3499/// FI
3500/// ENDFOR
3501/// \endcode
3502///
3503/// \headerfile <immintrin.h>
3504///
3505/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3506///
3507/// \param __X
3508/// A pointer to the memory used for loading values.
3509/// \param __M
3510/// A 128-bit vector of [2 x i64] containing the mask bits.
3511/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3512/// elements.
3513static __inline__ __m128i __DEFAULT_FN_ATTRS128
3514_mm_maskload_epi64(long long const *__X, __m128i __M)
3515{ /* __X is cast to (const __v2di *) and __M to __v2di for the builtin. */
3516 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3517}
3518
3519/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3520/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3521/// the corresponding element in the mask \a __M is set; otherwise, the
3522/// memory element is unchanged.
3523///
3524/// \code{.operation}
3525/// FOR i := 0 TO 7
3526/// j := i*32
3527/// IF __M[j+31] == 1
3528/// Store32(__X+(i*4), __Y[j+31:j])
3529/// FI
3530/// ENDFOR
3531/// \endcode
3532///
3533/// \headerfile <immintrin.h>
3534///
3535/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3536///
3537/// \param __X
3538/// A pointer to the memory used for storing values.
3539/// \param __M
3540/// A 256-bit vector of [8 x i32] containing the mask bits.
3541/// \param __Y
3542/// A 256-bit vector of [8 x i32] containing the values to store.
3543static __inline__ void __DEFAULT_FN_ATTRS256
3544_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3545{ /* __X is cast to (__v8si *); mask __M and data __Y are cast to __v8si. */
3546 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3547}
3548
3549/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3550/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3551/// the corresponding element in the mask \a __M is set; otherwise, the
3552/// memory element is unchanged.
3553///
3554/// \code{.operation}
3555/// FOR i := 0 TO 3
3556/// j := i*64
3557/// IF __M[j+63] == 1
3558/// Store64(__X+(i*8), __Y[j+63:j])
3559/// FI
3560/// ENDFOR
3561/// \endcode
3562///
3563/// \headerfile <immintrin.h>
3564///
3565/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3566///
3567/// \param __X
3568/// A pointer to the memory used for storing values.
3569/// \param __M
3570/// A 256-bit vector of [4 x i64] containing the mask bits.
3571/// \param __Y
3572/// A 256-bit vector of [4 x i64] containing the values to store.
3573static __inline__ void __DEFAULT_FN_ATTRS256
3574_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3575{ /* __X is cast to (__v4di *); mask __M and data __Y are cast to __v4di. */
3576 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3577}
3578
3579/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3580/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3581/// the corresponding element in the mask \a __M is set; otherwise, the
3582/// memory element is unchanged.
3583///
3584/// \code{.operation}
3585/// FOR i := 0 TO 3
3586/// j := i*32
3587/// IF __M[j+31] == 1
3588/// Store32(__X+(i*4), __Y[j+31:j])
3589/// FI
3590/// ENDFOR
3591/// \endcode
3592///
3593/// \headerfile <immintrin.h>
3594///
3595/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3596///
3597/// \param __X
3598/// A pointer to the memory used for storing values.
3599/// \param __M
3600/// A 128-bit vector of [4 x i32] containing the mask bits.
3601/// \param __Y
3602/// A 128-bit vector of [4 x i32] containing the values to store.
3603static __inline__ void __DEFAULT_FN_ATTRS128
3604_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3605{ /* __X is cast to (__v4si *); mask __M and data __Y are cast to __v4si. */
3606 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3607}
3608
3609/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3610/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3611/// the corresponding element in the mask \a __M is set; otherwise, the
3612/// memory element is unchanged.
3613///
3614/// \code{.operation}
3615/// FOR i := 0 TO 1
3616/// j := i*64
3617/// IF __M[j+63] == 1
3618/// Store64(__X+(i*8), __Y[j+63:j])
3619/// FI
3620/// ENDFOR
3621/// \endcode
3622///
3623/// \headerfile <immintrin.h>
3624///
3625/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3626///
3627/// \param __X
3628/// A pointer to the memory used for storing values.
3629/// \param __M
3630/// A 128-bit vector of [2 x i64] containing the mask bits.
3631/// \param __Y
3632/// A 128-bit vector of [2 x i64] containing the values to store.
3633static __inline__ void __DEFAULT_FN_ATTRS128
3634_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3635{ /* __X is cast to (__v2di *); mask __M and data __Y are cast to __v2di. */
3636 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3637}
3638
3639/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3640/// left by the number of bits given in the corresponding element of the
3641/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3642/// returns the result. If the shift count for any element is greater than
3643/// 31, the result for that element is zero.
3644///
3645/// \headerfile <immintrin.h>
3646///
3647/// This intrinsic corresponds to the \c VPSLLVD instruction.
3648///
3649/// \param __X
3650/// A 256-bit vector of [8 x i32] to be shifted.
3651/// \param __Y
3652/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3653/// bits).
3654/// \returns A 256-bit vector of [8 x i32] containing the result.
3655static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3656_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3657{ /* Values __X and counts __Y are cast to __v8si; the result is cast back to __m256i. */
3658 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3659}
3660
3661/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3662/// left by the number of bits given in the corresponding element of the
3663/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3664/// returns the result. If the shift count for any element is greater than
3665/// 31, the result for that element is zero.
3666///
3667/// \headerfile <immintrin.h>
3668///
3669/// This intrinsic corresponds to the \c VPSLLVD instruction.
3670///
3671/// \param __X
3672/// A 128-bit vector of [4 x i32] to be shifted.
3673/// \param __Y
3674/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3675/// bits).
3676/// \returns A 128-bit vector of [4 x i32] containing the result.
3677static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3678_mm_sllv_epi32(__m128i __X, __m128i __Y)
3679{ /* Values __X and counts __Y are cast to __v4si; the result is cast back to __m128i. */
3680 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3681}
3682
3683/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3684/// left by the number of bits given in the corresponding element of the
3685/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3686/// returns the result. If the shift count for any element is greater than
3687/// 63, the result for that element is zero.
3688///
3689/// \headerfile <immintrin.h>
3690///
3691/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3692///
3693/// \param __X
3694/// A 256-bit vector of [4 x i64] to be shifted.
3695/// \param __Y
3696/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3697/// bits).
3698/// \returns A 256-bit vector of [4 x i64] containing the result.
3699static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3700_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3701{ /* Values __X and counts __Y are cast to __v4di; the result is cast back to __m256i. */
3702 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3703}
3704
3705/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3706/// left by the number of bits given in the corresponding element of the
3707/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3708/// returns the result. If the shift count for any element is greater than
3709/// 63, the result for that element is zero.
3710///
3711/// \headerfile <immintrin.h>
3712///
3713/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3714///
3715/// \param __X
3716/// A 128-bit vector of [2 x i64] to be shifted.
3717/// \param __Y
3718/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3719/// bits).
3720/// \returns A 128-bit vector of [2 x i64] containing the result.
3721static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3722_mm_sllv_epi64(__m128i __X, __m128i __Y)
3723{ /* Values __X and counts __Y are cast to __v2di; the result is cast back to __m128i. */
3724 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3725}
3726
3727/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3728/// right by the number of bits given in the corresponding element of the
3729/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3730/// returns the result. If the shift count for any element is greater than
3731/// 31, the result for that element is 0 or -1 according to the sign bit
3732/// for that element.
3733///
3734/// \headerfile <immintrin.h>
3735///
3736/// This intrinsic corresponds to the \c VPSRAVD instruction.
3737///
3738/// \param __X
3739/// A 256-bit vector of [8 x i32] to be shifted.
3740/// \param __Y
3741/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3742/// bits).
3743/// \returns A 256-bit vector of [8 x i32] containing the result.
3744static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3745_mm256_srav_epi32(__m256i __X, __m256i __Y)
3746{ /* Values __X and counts __Y are cast to __v8si; the result is cast back to __m256i. */
3747 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3748}
3749
3750/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3751/// right by the number of bits given in the corresponding element of the
3752/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3753/// returns the result. If the shift count for any element is greater than
3754/// 31, the result for that element is 0 or -1 according to the sign bit
3755/// for that element.
3756///
3757/// \headerfile <immintrin.h>
3758///
3759/// This intrinsic corresponds to the \c VPSRAVD instruction.
3760///
3761/// \param __X
3762/// A 128-bit vector of [4 x i32] to be shifted.
3763/// \param __Y
3764/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3765/// bits).
3766/// \returns A 128-bit vector of [4 x i32] containing the result.
3767static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3768_mm_srav_epi32(__m128i __X, __m128i __Y)
3769{ /* Values __X and counts __Y are cast to __v4si; the result is cast back to __m128i. */
3770 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3771}
3772
3773/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3774/// right by the number of bits given in the corresponding element of the
3775/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3776/// returns the result. If the shift count for any element is greater than
3777/// 31, the result for that element is zero.
3778///
3779/// \headerfile <immintrin.h>
3780///
3781/// This intrinsic corresponds to the \c VPSRLVD instruction.
3782///
3783/// \param __X
3784/// A 256-bit vector of [8 x i32] to be shifted.
3785/// \param __Y
3786/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3787/// bits).
3788/// \returns A 256-bit vector of [8 x i32] containing the result.
3789static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3790_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3791{ /* Values __X and counts __Y are cast to __v8si; the result is cast back to __m256i. */
3792 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3793}
3794
3795/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3796/// right by the number of bits given in the corresponding element of the
3797/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3798/// returns the result. If the shift count for any element is greater than
3799/// 31, the result for that element is zero.
3800///
3801/// \headerfile <immintrin.h>
3802///
3803/// This intrinsic corresponds to the \c VPSRLVD instruction.
3804///
3805/// \param __X
3806/// A 128-bit vector of [4 x i32] to be shifted.
3807/// \param __Y
3808/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3809/// bits).
3810/// \returns A 128-bit vector of [4 x i32] containing the result.
3811static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3812_mm_srlv_epi32(__m128i __X, __m128i __Y)
3813{ /* Values __X and counts __Y are cast to __v4si; the result is cast back to __m128i. */
3814 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3815}
3816
3817/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3818/// right by the number of bits given in the corresponding element of the
3819/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3820/// returns the result. If the shift count for any element is greater than
3821/// 63, the result for that element is zero.
3822///
3823/// \headerfile <immintrin.h>
3824///
3825/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3826///
3827/// \param __X
3828/// A 256-bit vector of [4 x i64] to be shifted.
3829/// \param __Y
3830/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3831/// bits).
3832/// \returns A 256-bit vector of [4 x i64] containing the result.
3833static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3834_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3835{ /* Values __X and counts __Y are cast to __v4di; the result is cast back to __m256i. */
3836 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3837}
3838
3839/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3840/// right by the number of bits given in the corresponding element of the
3841/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3842/// returns the result. If the shift count for any element is greater than
3843/// 63, the result for that element is zero.
3844///
3845/// \headerfile <immintrin.h>
3846///
3847/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3848///
3849/// \param __X
3850/// A 128-bit vector of [2 x i64] to be shifted.
3851/// \param __Y
3852/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3853/// bits).
3854/// \returns A 128-bit vector of [2 x i64] containing the result.
3855static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3856_mm_srlv_epi64(__m128i __X, __m128i __Y)
3857{ /* Values __X and counts __Y are cast to __v2di; the result is cast back to __m128i. */
3858 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3859}
3860
3861/// Conditionally gathers two 64-bit floating-point values, either from the
3862/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3863/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3864/// of [2 x double] in \a mask determines the source for each element.
3865///
3866/// \code{.operation}
3867/// FOR element := 0 to 1
3868/// j := element*64
3869/// k := element*32
3870/// IF mask[j+63] == 0
3871/// result[j+63:j] := a[j+63:j]
3872/// ELSE
3873/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3874/// FI
3875/// ENDFOR
3876/// \endcode
3877///
3878/// \headerfile <immintrin.h>
3879///
3880/// \code
3881/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
3882/// __m128d mask, const int s);
3883/// \endcode
3884///
3885/// This intrinsic corresponds to the \c VGATHERDPD instruction.
3886///
3887/// \param a
3888/// A 128-bit vector of [2 x double] used as the source when a mask bit is
3889/// zero.
3890/// \param m
3891/// A pointer to the memory used for loading values.
3892/// \param i
3893/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3894/// the first two elements are used.
3895/// \param mask
3896/// A 128-bit vector of [2 x double] containing the mask. The most
3897/// significant bit of each element in the mask vector represents the mask
3898/// bits. If a mask bit is zero, the corresponding value from vector \a a
3899/// is gathered; otherwise the value is loaded from memory.
3900/// \param s
3901/// A literal constant scale factor for the indexes in \a i. Must be
3902/// 1, 2, 4, or 8.
3903/// \returns A 128-bit vector of [2 x double] containing the gathered values.
3904#define _mm_mask_i32gather_pd(a, m, i, mask, s) /* a and mask are __m128d, i is __m128i; s must be a literal 1, 2, 4, or 8 */ \
3905 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
3906 (double const *)(m), \
3907 (__v4si)(__m128i)(i), \
3908 (__v2df)(__m128d)(mask), (s)))
3909
3910/// Conditionally gathers four 64-bit floating-point values, either from the
3911/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
3912/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
3913/// of [4 x double] in \a mask determines the source for each element.
3914///
3915/// \code{.operation}
3916/// FOR element := 0 to 3
3917/// j := element*64
3918/// k := element*32
3919/// IF mask[j+63] == 0
3920/// result[j+63:j] := a[j+63:j]
3921/// ELSE
3922/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3923/// FI
3924/// ENDFOR
3925/// \endcode
3926///
3927/// \headerfile <immintrin.h>
3928///
3929/// \code
3930/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
3931/// __m256d mask, const int s);
3932/// \endcode
3933///
3934/// This intrinsic corresponds to the \c VGATHERDPD instruction.
3935///
3936/// \param a
3937/// A 256-bit vector of [4 x double] used as the source when a mask bit is
3938/// zero.
3939/// \param m
3940/// A pointer to the memory used for loading values.
3941/// \param i
3942/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3943/// \param mask
3944/// A 256-bit vector of [4 x double] containing the mask. The most
3945/// significant bit of each element in the mask vector represents the mask
3946/// bits. If a mask bit is zero, the corresponding value from vector \a a
3947/// is gathered; otherwise the value is loaded from memory.
3948/// \param s
3949/// A literal constant scale factor for the indexes in \a i. Must be
3950/// 1, 2, 4, or 8.
3951/// \returns A 256-bit vector of [4 x double] containing the gathered values.
3952#define _mm256_mask_i32gather_pd(a, m, i, mask, s) /* s must be a literal 1, 2, 4, or 8; it is passed on uncast */ \
3953 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
3954 (double const *)(m), \
3955 (__v4si)(__m128i)(i), \
3956 (__v4df)(__m256d)(mask), (s)))
3957
3958/// Conditionally gathers two 64-bit floating-point values, either from the
3959/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3960/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3961/// of [2 x double] in \a mask determines the source for each element.
3962///
3963/// \code{.operation}
3964/// FOR element := 0 to 1
3965/// j := element*64
3966/// k := element*64
3967/// IF mask[j+63] == 0
3968/// result[j+63:j] := a[j+63:j]
3969/// ELSE
3970/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3971/// FI
3972/// ENDFOR
3973/// \endcode
3974///
3975/// \headerfile <immintrin.h>
3976///
3977/// \code
3978/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
3979/// __m128d mask, const int s);
3980/// \endcode
3981///
3982/// This intrinsic corresponds to the \c VGATHERQPD instruction.
3983///
3984/// \param a
3985/// A 128-bit vector of [2 x double] used as the source when a mask bit is
3986/// zero.
3987/// \param m
3988/// A pointer to the memory used for loading values.
3989/// \param i
3990/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3991/// \param mask
3992/// A 128-bit vector of [2 x double] containing the mask. The most
3993/// significant bit of each element in the mask vector represents the mask
3994/// bits. If a mask bit is zero, the corresponding value from vector \a a
3995/// is gathered; otherwise the value is loaded from memory.
3996/// \param s
3997/// A literal constant scale factor for the indexes in \a i. Must be
3998/// 1, 2, 4, or 8.
3999/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4000#define _mm_mask_i64gather_pd(a, m, i, mask, s) /* s must be a literal 1, 2, 4, or 8; it is passed on uncast */ \
4001 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
4002 (double const *)(m), \
4003 (__v2di)(__m128i)(i), \
4004 (__v2df)(__m128d)(mask), (s)))
4005
4006/// Conditionally gathers four 64-bit floating-point values, either from the
4007/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4008/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4009/// of [4 x double] in \a mask determines the source for each element.
4010///
4011/// \code{.operation}
4012/// FOR element := 0 to 3
4013/// j := element*64
4014/// k := element*64
4015/// IF mask[j+63] == 0
4016/// result[j+63:j] := a[j+63:j]
4017/// ELSE
4018/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4019/// FI
4020/// ENDFOR
4021/// \endcode
4022///
4023/// \headerfile <immintrin.h>
4024///
4025/// \code
4026/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4027/// __m256d mask, const int s);
4028/// \endcode
4029///
4030/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4031///
4032/// \param a
4033/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4034/// zero.
4035/// \param m
4036/// A pointer to the memory used for loading values.
4037/// \param i
4038/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4039/// \param mask
4040/// A 256-bit vector of [4 x double] containing the mask. The most
4041/// significant bit of each element in the mask vector represents the mask
4042/// bits. If a mask bit is zero, the corresponding value from vector \a a
4043/// is gathered; otherwise the value is loaded from memory.
4044/// \param s
4045/// A literal constant scale factor for the indexes in \a i. Must be
4046/// 1, 2, 4, or 8.
4047/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4048#define _mm256_mask_i64gather_pd(a, m, i, mask, s) /* s must be a literal 1, 2, 4, or 8; it is passed on uncast */ \
4049 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
4050 (double const *)(m), \
4051 (__v4di)(__m256i)(i), \
4052 (__v4df)(__m256d)(mask), (s)))
4053
4054/// Conditionally gathers four 32-bit floating-point values, either from the
4055/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4056/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4057/// of [4 x float] in \a mask determines the source for each element.
4058///
4059/// \code{.operation}
4060/// FOR element := 0 to 3
4061/// j := element*32
4062/// k := element*32
4063/// IF mask[j+31] == 0
4064/// result[j+31:j] := a[j+31:j]
4065/// ELSE
4066/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4067/// FI
4068/// ENDFOR
4069/// \endcode
4070///
4071/// \headerfile <immintrin.h>
4072///
4073/// \code
4074/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4075/// __m128 mask, const int s);
4076/// \endcode
4077///
4078/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4079///
4080/// \param a
4081/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4082/// zero.
4083/// \param m
4084/// A pointer to the memory used for loading values.
4085/// \param i
4086/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4087/// \param mask
4088/// A 128-bit vector of [4 x float] containing the mask. The most
4089/// significant bit of each element in the mask vector represents the mask
4090/// bits. If a mask bit is zero, the corresponding value from vector \a a
4091/// is gathered; otherwise the value is loaded from memory.
4092/// \param s
4093/// A literal constant scale factor for the indexes in \a i. Must be
4094/// 1, 2, 4, or 8.
4095/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a),    /* source used where the mask bit is zero */ \
      (const float *)(m),     /* base pointer */ \
      (__v4si)(__m128i)(i),   /* four 32-bit indexes */ \
      (__v4sf)(__m128)(mask), /* per-element mask */ \
      (s)))                   /* index scale */
4101
4102/// Conditionally gathers eight 32-bit floating-point values, either from the
4103/// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4104/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4105/// of [8 x float] in \a mask determines the source for each element.
4106///
4107/// \code{.operation}
4108/// FOR element := 0 to 7
4109/// j := element*32
4110/// k := element*32
4111/// IF mask[j+31] == 0
4112/// result[j+31:j] := a[j+31:j]
4113/// ELSE
4114/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4115/// FI
4116/// ENDFOR
4117/// \endcode
4118///
4119/// \headerfile <immintrin.h>
4120///
4121/// \code
4122/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4123/// __m256 mask, const int s);
4124/// \endcode
4125///
4126/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4127///
4128/// \param a
4129/// A 256-bit vector of [8 x float] used as the source when a mask bit is
4130/// zero.
4131/// \param m
4132/// A pointer to the memory used for loading values.
4133/// \param i
4134/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4135/// \param mask
4136/// A 256-bit vector of [8 x float] containing the mask. The most
4137/// significant bit of each element in the mask vector represents the mask
4138/// bits. If a mask bit is zero, the corresponding value from vector \a a
4139/// is gathered; otherwise the value is loaded from memory.
4140/// \param s
4141/// A literal constant scale factor for the indexes in \a i. Must be
4142/// 1, 2, 4, or 8.
4143/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a),    /* source used where the mask bit is zero */ \
      (const float *)(m),     /* base pointer */ \
      (__v8si)(__m256i)(i),   /* eight 32-bit indexes */ \
      (__v8sf)(__m256)(mask), /* per-element mask */ \
      (s)))                   /* index scale */
4149
4150/// Conditionally gathers two 32-bit floating-point values, either from the
4151/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4152/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4153/// of [4 x float] in \a mask determines the source for the lower two
4154/// elements. The upper two elements of the result are zeroed.
4155///
4156/// \code{.operation}
4157/// FOR element := 0 to 1
4158/// j := element*32
4159/// k := element*64
4160/// IF mask[j+31] == 0
4161/// result[j+31:j] := a[j+31:j]
4162/// ELSE
4163/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4164/// FI
4165/// ENDFOR
4166/// result[127:64] := 0
4167/// \endcode
4168///
4169/// \headerfile <immintrin.h>
4170///
4171/// \code
4172/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4173/// __m128 mask, const int s);
4174/// \endcode
4175///
4176/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4177///
4178/// \param a
4179/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4180/// zero. Only the first two elements are used.
4181/// \param m
4182/// A pointer to the memory used for loading values.
4183/// \param i
4184/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4185/// \param mask
4186/// A 128-bit vector of [4 x float] containing the mask. The most
4187/// significant bit of each element in the mask vector represents the mask
4188/// bits. If a mask bit is zero, the corresponding value from vector \a a
4189/// is gathered; otherwise the value is loaded from memory. Only the first
4190/// two elements are used.
4191/// \param s
4192/// A literal constant scale factor for the indexes in \a i. Must be
4193/// 1, 2, 4, or 8.
4194/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a),    /* source used where the mask bit is zero */ \
      (const float *)(m),     /* base pointer */ \
      (__v2di)(__m128i)(i),   /* two 64-bit indexes */ \
      (__v4sf)(__m128)(mask), /* per-element mask */ \
      (s)))                   /* index scale */
4200
4201/// Conditionally gathers four 32-bit floating-point values, either from the
4202/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4203/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4204/// of [4 x float] in \a mask determines the source for each element.
4205///
4206/// \code{.operation}
4207/// FOR element := 0 to 3
4208/// j := element*32
4209/// k := element*64
4210/// IF mask[j+31] == 0
4211/// result[j+31:j] := a[j+31:j]
4212/// ELSE
4213/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4214/// FI
4215/// ENDFOR
4216/// \endcode
4217///
4218/// \headerfile <immintrin.h>
4219///
4220/// \code
4221/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4222/// __m128 mask, const int s);
4223/// \endcode
4224///
4225/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4226///
4227/// \param a
4228/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4229/// zero.
4230/// \param m
4231/// A pointer to the memory used for loading values.
4232/// \param i
4233/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4234/// \param mask
4235/// A 128-bit vector of [4 x float] containing the mask. The most
4236/// significant bit of each element in the mask vector represents the mask
4237/// bits. If a mask bit is zero, the corresponding value from vector \a a
4238/// is gathered; otherwise the value is loaded from memory.
4239/// \param s
4240/// A literal constant scale factor for the indexes in \a i. Must be
4241/// 1, 2, 4, or 8.
4242/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a),    /* source used where the mask bit is zero */ \
      (const float *)(m),     /* base pointer */ \
      (__v4di)(__m256i)(i),   /* four 64-bit indexes */ \
      (__v4sf)(__m128)(mask), /* per-element mask */ \
      (s)))                   /* index scale */
4248
4249/// Conditionally gathers four 32-bit integer values, either from the
4250/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4251/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4252/// of [4 x i32] in \a mask determines the source for each element.
4253///
4254/// \code{.operation}
4255/// FOR element := 0 to 3
4256/// j := element*32
4257/// k := element*32
4258/// IF mask[j+31] == 0
4259/// result[j+31:j] := a[j+31:j]
4260/// ELSE
4261/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4262/// FI
4263/// ENDFOR
4264/// \endcode
4265///
4266/// \headerfile <immintrin.h>
4267///
4268/// \code
4269/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4270/// __m128i mask, const int s);
4271/// \endcode
4272///
4273/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4274///
4275/// \param a
4276/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4277/// zero.
4278/// \param m
4279/// A pointer to the memory used for loading values.
4280/// \param i
4281/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4282/// \param mask
4283/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4284/// bit of each element in the mask vector represents the mask bits. If a
4285/// mask bit is zero, the corresponding value from vector \a a is gathered;
4286/// otherwise the value is loaded from memory.
4287/// \param s
4288/// A literal constant scale factor for the indexes in \a i. Must be
4289/// 1, 2, 4, or 8.
4290/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a),    /* source used where the mask bit is zero */ \
      (const int *)(m),        /* base pointer */ \
      (__v4si)(__m128i)(i),    /* four 32-bit indexes */ \
      (__v4si)(__m128i)(mask), /* per-element mask */ \
      (s)))                    /* index scale */
4296
4297/// Conditionally gathers eight 32-bit integer values, either from the
4298/// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4299/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4300/// of [8 x i32] in \a mask determines the source for each element.
4301///
4302/// \code{.operation}
4303/// FOR element := 0 to 7
4304/// j := element*32
4305/// k := element*32
4306/// IF mask[j+31] == 0
4307/// result[j+31:j] := a[j+31:j]
4308/// ELSE
4309/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4310/// FI
4311/// ENDFOR
4312/// \endcode
4313///
4314/// \headerfile <immintrin.h>
4315///
4316/// \code
4317/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4318/// __m256i mask, const int s);
4319/// \endcode
4320///
4321/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4322///
4323/// \param a
4324/// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4325/// zero.
4326/// \param m
4327/// A pointer to the memory used for loading values.
4328/// \param i
4329/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4330/// \param mask
4331/// A 256-bit vector of [8 x i32] containing the mask. The most significant
4332/// bit of each element in the mask vector represents the mask bits. If a
4333/// mask bit is zero, the corresponding value from vector \a a is gathered;
4334/// otherwise the value is loaded from memory.
4335/// \param s
4336/// A literal constant scale factor for the indexes in \a i. Must be
4337/// 1, 2, 4, or 8.
4338/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)(__m256i)(a),    /* source used where the mask bit is zero */ \
      (const int *)(m),        /* base pointer */ \
      (__v8si)(__m256i)(i),    /* eight 32-bit indexes */ \
      (__v8si)(__m256i)(mask), /* per-element mask */ \
      (s)))                    /* index scale */
4344
4345/// Conditionally gathers two 32-bit integer values, either from the
4346/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4347/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4348/// of [4 x i32] in \a mask determines the source for the lower two
4349/// elements. The upper two elements of the result are zeroed.
4350///
4351/// \code{.operation}
4352/// FOR element := 0 to 1
4353/// j := element*32
4354/// k := element*64
4355/// IF mask[j+31] == 0
4356/// result[j+31:j] := a[j+31:j]
4357/// ELSE
4358/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4359/// FI
4360/// ENDFOR
4361/// result[127:64] := 0
4362/// \endcode
4363///
4364/// \headerfile <immintrin.h>
4365///
4366/// \code
4367/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4368/// __m128i mask, const int s);
4369/// \endcode
4370///
4371/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4372///
4373/// \param a
4374/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4375/// zero. Only the first two elements are used.
4376/// \param m
4377/// A pointer to the memory used for loading values.
4378/// \param i
4379/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4380/// \param mask
4381/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4382/// bit of each element in the mask vector represents the mask bits. If a
4383/// mask bit is zero, the corresponding value from vector \a a is gathered;
4384/// otherwise the value is loaded from memory. Only the first two elements
4385/// are used.
4386/// \param s
4387/// A literal constant scale factor for the indexes in \a i. Must be
4388/// 1, 2, 4, or 8.
4389/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)(__m128i)(a),    /* source used where the mask bit is zero */ \
      (const int *)(m),        /* base pointer */ \
      (__v2di)(__m128i)(i),    /* two 64-bit indexes */ \
      (__v4si)(__m128i)(mask), /* per-element mask */ \
      (s)))                    /* index scale */
4395
4396/// Conditionally gathers four 32-bit integer values, either from the
4397/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4398/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4399/// of [4 x i32] in \a mask determines the source for each element.
4400///
4401/// \code{.operation}
4402/// FOR element := 0 to 3
4403/// j := element*32
4404/// k := element*64
4405/// IF mask[j+31] == 0
4406/// result[j+31:j] := a[j+31:j]
4407/// ELSE
4408/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4409/// FI
4410/// ENDFOR
4411/// \endcode
4412///
4413/// \headerfile <immintrin.h>
4414///
4415/// \code
4416/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4417/// __m128i mask, const int s);
4418/// \endcode
4419///
4420/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4421///
4422/// \param a
4423/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4424/// zero.
4425/// \param m
4426/// A pointer to the memory used for loading values.
4427/// \param i
4428/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4429/// \param mask
4430/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4431/// bit of each element in the mask vector represents the mask bits. If a
4432/// mask bit is zero, the corresponding value from vector \a a is gathered;
4433/// otherwise the value is loaded from memory.
4434/// \param s
4435/// A literal constant scale factor for the indexes in \a i. Must be
4436/// 1, 2, 4, or 8.
4437/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a),    /* source used where the mask bit is zero */ \
      (const int *)(m),        /* base pointer */ \
      (__v4di)(__m256i)(i),    /* four 64-bit indexes */ \
      (__v4si)(__m128i)(mask), /* per-element mask */ \
      (s)))                    /* index scale */
4443
4444/// Conditionally gathers two 64-bit integer values, either from the
4445/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4446/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4447/// of [2 x i64] in \a mask determines the source for each element.
4448///
4449/// \code{.operation}
4450/// FOR element := 0 to 1
4451/// j := element*64
4452/// k := element*32
4453/// IF mask[j+63] == 0
4454/// result[j+63:j] := a[j+63:j]
4455/// ELSE
4456/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4457/// FI
4458/// ENDFOR
4459/// \endcode
4460///
4461/// \headerfile <immintrin.h>
4462///
4463/// \code
4464/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4465/// __m128i mask, const int s);
4466/// \endcode
4467///
4468/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4469///
4470/// \param a
4471/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4472/// zero.
4473/// \param m
4474/// A pointer to the memory used for loading values.
4475/// \param i
4476/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4477/// the first two elements are used.
4478/// \param mask
4479/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4480/// bit of each element in the mask vector represents the mask bits. If a
4481/// mask bit is zero, the corresponding value from vector \a a is gathered;
4482/// otherwise the value is loaded from memory.
4483/// \param s
4484/// A literal constant scale factor for the indexes in \a i. Must be
4485/// 1, 2, 4, or 8.
4486/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a),    /* source used where the mask bit is zero */ \
      (const long long *)(m),  /* base pointer */ \
      (__v4si)(__m128i)(i),    /* 32-bit indexes */ \
      (__v2di)(__m128i)(mask), /* per-element mask */ \
      (s)))                    /* index scale */
4492
4493/// Conditionally gathers four 64-bit integer values, either from the
4494/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4495/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4496/// of [4 x i64] in \a mask determines the source for each element.
4497///
4498/// \code{.operation}
4499/// FOR element := 0 to 3
4500/// j := element*64
4501/// k := element*32
4502/// IF mask[j+63] == 0
4503/// result[j+63:j] := a[j+63:j]
4504/// ELSE
4505/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4506/// FI
4507/// ENDFOR
4508/// \endcode
4509///
4510/// \headerfile <immintrin.h>
4511///
4512/// \code
4513/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4514/// __m128i i, __m256i mask, const int s);
4515/// \endcode
4516///
4517/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4518///
4519/// \param a
4520/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4521/// zero.
4522/// \param m
4523/// A pointer to the memory used for loading values.
4524/// \param i
4525/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4526/// \param mask
4527/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4528/// bit of each element in the mask vector represents the mask bits. If a
4529/// mask bit is zero, the corresponding value from vector \a a is gathered;
4530/// otherwise the value is loaded from memory.
4531/// \param s
4532/// A literal constant scale factor for the indexes in \a i. Must be
4533/// 1, 2, 4, or 8.
4534/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a),    /* source used where the mask bit is zero */ \
      (const long long *)(m),  /* base pointer */ \
      (__v4si)(__m128i)(i),    /* four 32-bit indexes */ \
      (__v4di)(__m256i)(mask), /* per-element mask */ \
      (s)))                    /* index scale */
4540
4541/// Conditionally gathers two 64-bit integer values, either from the
4542/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4543/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4544/// of [2 x i64] in \a mask determines the source for each element.
4545///
4546/// \code{.operation}
4547/// FOR element := 0 to 1
4548/// j := element*64
4549/// k := element*64
4550/// IF mask[j+63] == 0
4551/// result[j+63:j] := a[j+63:j]
4552/// ELSE
4553/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4554/// FI
4555/// ENDFOR
4556/// \endcode
4557///
4558/// \headerfile <immintrin.h>
4559///
4560/// \code
4561/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4562/// __m128i mask, const int s);
4563/// \endcode
4564///
4565/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4566///
4567/// \param a
4568/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4569/// zero.
4570/// \param m
4571/// A pointer to the memory used for loading values.
4572/// \param i
4573/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4574/// \param mask
4575/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4576/// bit of each element in the mask vector represents the mask bits. If a
4577/// mask bit is zero, the corresponding value from vector \a a is gathered;
4578/// otherwise the value is loaded from memory.
4579/// \param s
4580/// A literal constant scale factor for the indexes in \a i. Must be
4581/// 1, 2, 4, or 8.
4582/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a),    /* source used where the mask bit is zero */ \
      (const long long *)(m),  /* base pointer */ \
      (__v2di)(__m128i)(i),    /* two 64-bit indexes */ \
      (__v2di)(__m128i)(mask), /* per-element mask */ \
      (s)))                    /* index scale */
4588
4589/// Conditionally gathers four 64-bit integer values, either from the
4590/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4591/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4592/// of [4 x i64] in \a mask determines the source for each element.
4593///
4594/// \code{.operation}
4595/// FOR element := 0 to 3
4596/// j := element*64
4597/// k := element*64
4598/// IF mask[j+63] == 0
4599/// result[j+63:j] := a[j+63:j]
4600/// ELSE
4601/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4602/// FI
4603/// ENDFOR
4604/// \endcode
4605///
4606/// \headerfile <immintrin.h>
4607///
4608/// \code
4609/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4610/// __m256i i, __m256i mask, const int s);
4611/// \endcode
4612///
4613/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4614///
4615/// \param a
4616/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4617/// zero.
4618/// \param m
4619/// A pointer to the memory used for loading values.
4620/// \param i
4621/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4622/// \param mask
4623/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4624/// bit of each element in the mask vector represents the mask bits. If a
4625/// mask bit is zero, the corresponding value from vector \a a is gathered;
4626/// otherwise the value is loaded from memory.
4627/// \param s
4628/// A literal constant scale factor for the indexes in \a i. Must be
4629/// 1, 2, 4, or 8.
4630/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a),    /* source used where the mask bit is zero */ \
      (const long long *)(m),  /* base pointer */ \
      (__v4di)(__m256i)(i),    /* four 64-bit indexes */ \
      (__v4di)(__m256i)(mask), /* per-element mask */ \
      (s)))                    /* index scale */
4636
4637/// Gathers two 64-bit floating-point values from memory \a m using scaled
4638/// indexes from the 128-bit vector of [4 x i32] in \a i.
4639///
4640/// \code{.operation}
4641/// FOR element := 0 to 1
4642/// j := element*64
4643/// k := element*32
4644/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4645/// ENDFOR
4646/// \endcode
4647///
4648/// \headerfile <immintrin.h>
4649///
4650/// \code
4651/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4652/// \endcode
4653///
4654/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4655///
4656/// \param m
4657/// A pointer to the memory used for loading values.
4658/// \param i
4659/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4660/// the first two elements are used.
4661/// \param s
4662/// A literal constant scale factor for the indexes in \a i. Must be
4663/// 1, 2, 4, or 8.
4664/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      (__v2df)_mm_undefined_pd(), /* source operand left undefined */ \
      (const double *)(m),        /* base pointer */ \
      (__v4si)(__m128i)(i),       /* 32-bit indexes */ \
      /* mask: zero compared equal to zero in every element */ \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      (s)))                       /* index scale */
4672
4673/// Gathers four 64-bit floating-point values from memory \a m using scaled
4674/// indexes from the 128-bit vector of [4 x i32] in \a i.
4675///
4676/// \code{.operation}
4677/// FOR element := 0 to 3
4678/// j := element*64
4679/// k := element*32
4680/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4681/// ENDFOR
4682/// \endcode
4683///
4684/// \headerfile <immintrin.h>
4685///
4686/// \code
4687/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4688/// \endcode
4689///
4690/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4691///
4692/// \param m
4693/// A pointer to the memory used for loading values.
4694/// \param i
4695/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4696/// \param s
4697/// A literal constant scale factor for the indexes in \a i. Must be
4698/// 1, 2, 4, or 8.
4699/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)_mm256_undefined_pd(), /* source operand left undefined */ \
      (const double *)(m),           /* base pointer */ \
      (__v4si)(__m128i)(i),          /* four 32-bit indexes */ \
      /* mask: zero compared equal to zero in every element */ \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))                          /* index scale */
4708
4709/// Gathers two 64-bit floating-point values from memory \a m using scaled
4710/// indexes from the 128-bit vector of [2 x i64] in \a i.
4711///
4712/// \code{.operation}
4713/// FOR element := 0 to 1
4714/// j := element*64
4715/// k := element*64
4716/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4717/// ENDFOR
4718/// \endcode
4719///
4720/// \headerfile <immintrin.h>
4721///
4722/// \code
4723/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4724/// \endcode
4725///
4726/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4727///
4728/// \param m
4729/// A pointer to the memory used for loading values.
4730/// \param i
4731/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4732/// \param s
4733/// A literal constant scale factor for the indexes in \a i. Must be
4734/// 1, 2, 4, or 8.
4735/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)_mm_undefined_pd(), /* source operand left undefined */ \
      (const double *)(m),        /* base pointer */ \
      (__v2di)(__m128i)(i),       /* two 64-bit indexes */ \
      /* mask: zero compared equal to zero in every element */ \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      (s)))                       /* index scale */
4743
4744/// Gathers four 64-bit floating-point values from memory \a m using scaled
4745/// indexes from the 256-bit vector of [4 x i64] in \a i.
4746///
4747/// \code{.operation}
4748/// FOR element := 0 to 3
4749/// j := element*64
4750/// k := element*64
4751/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4752/// ENDFOR
4753/// \endcode
4754///
4755/// \headerfile <immintrin.h>
4756///
4757/// \code
4758/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4759/// \endcode
4760///
4761/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4762///
4763/// \param m
4764/// A pointer to the memory used for loading values.
4765/// \param i
4766/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4767/// \param s
4768/// A literal constant scale factor for the indexes in \a i. Must be
4769/// 1, 2, 4, or 8.
4770/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)_mm256_undefined_pd(), /* source operand left undefined */ \
      (const double *)(m),           /* base pointer */ \
      (__v4di)(__m256i)(i),          /* four 64-bit indexes */ \
      /* mask: zero compared equal to zero in every element */ \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))                          /* index scale */
4779
4780/// Gathers four 32-bit floating-point values from memory \a m using scaled
4781/// indexes from the 128-bit vector of [4 x i32] in \a i.
4782///
4783/// \code{.operation}
4784/// FOR element := 0 to 3
4785/// j := element*32
4786/// k := element*32
4787/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4788/// ENDFOR
4789/// \endcode
4790///
4791/// \headerfile <immintrin.h>
4792///
4793/// \code
4794/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4795/// \endcode
4796///
4797/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4798///
4799/// \param m
4800/// A pointer to the memory used for loading values.
4801/// \param i
4802/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4803/// \param s
4804/// A literal constant scale factor for the indexes in \a i. Must be
4805/// 1, 2, 4, or 8.
4806/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)_mm_undefined_ps(), /* source operand left undefined */ \
      (const float *)(m),         /* base pointer */ \
      (__v4si)(__m128i)(i),       /* four 32-bit indexes */ \
      /* mask: zero compared equal to zero in every element */ \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))                       /* index scale */
4814
4815/// Gathers eight 32-bit floating-point values from memory \a m using scaled
4816/// indexes from the 256-bit vector of [8 x i32] in \a i.
4817///
4818/// \code{.operation}
4819/// FOR element := 0 to 7
4820/// j := element*32
4821/// k := element*32
4822/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4823/// ENDFOR
4824/// \endcode
4825///
4826/// \headerfile <immintrin.h>
4827///
4828/// \code
4829/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4830/// \endcode
4831///
4832/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4833///
4834/// \param m
4835/// A pointer to the memory used for loading values.
4836/// \param i
4837/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4838/// \param s
4839/// A literal constant scale factor for the indexes in \a i. Must be
4840/// 1, 2, 4, or 8.
4841/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)_mm256_undefined_ps(), /* source operand left undefined */ \
      (const float *)(m),            /* base pointer */ \
      (__v8si)(__m256i)(i),          /* eight 32-bit indexes */ \
      /* mask: zero compared equal to zero in every element */ \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))                          /* index scale */
4850
4851/// Gathers two 32-bit floating-point values from memory \a m using scaled
4852/// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4853/// elements of the result are zeroed.
4854///
4855/// \code{.operation}
4856/// FOR element := 0 to 1
4857/// j := element*32
4858/// k := element*64
4859/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4860/// ENDFOR
4861/// result[127:64] := 0
4862/// \endcode
4863///
4864/// \headerfile <immintrin.h>
4865///
4866/// \code
4867/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4868/// \endcode
4869///
4870/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4871///
4872/// \param m
4873/// A pointer to the memory used for loading values.
4874/// \param i
4875/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4876/// \param s
4877/// A literal constant scale factor for the indexes in \a i. Must be
4878/// 1, 2, 4, or 8.
4879/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)_mm_undefined_ps(), /* source operand left undefined */ \
      (const float *)(m),         /* base pointer */ \
      (__v2di)(__m128i)(i),       /* two 64-bit indexes */ \
      /* mask: zero compared equal to zero in every element */ \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))                       /* index scale */
4887
4888/// Gathers four 32-bit floating-point values from memory \a m using scaled
4889/// indexes from the 256-bit vector of [4 x i64] in \a i.
4890///
4891/// \code{.operation}
4892/// FOR element := 0 to 3
4893/// j := element*32
4894/// k := element*64
4895/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4896/// ENDFOR
4897/// \endcode
4898///
4899/// \headerfile <immintrin.h>
4900///
4901/// \code
4902/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4903/// \endcode
4904///
4905/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4906///
4907/// \param m
4908/// A pointer to the memory used for loading values.
4909/// \param i
4910/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4911/// \param s
4912/// A literal constant scale factor for the indexes in \a i. Must be
4913/// 1, 2, 4, or 8.
4914/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4915#define _mm256_i64gather_ps(m, i, s) \
4916 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
4917 (float const *)(m), \
4918 (__v4di)(__m256i)(i), \
4919 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4920 _mm_setzero_ps()), \
4921 (s)))
4922
4923/// Gathers four 32-bit integer values from memory \a m using scaled
4924/// indexes from the 128-bit vector of [4 x i32] in \a i.
4925///
4926/// \code{.operation}
4927/// FOR element := 0 to 3
4928/// j := element*32
4929/// k := element*32
4930/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4931/// ENDFOR
4932/// \endcode
4933///
4934/// \headerfile <immintrin.h>
4935///
4936/// \code
4937/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
4938/// \endcode
4939///
4940/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4941///
4942/// \param m
4943/// A pointer to the memory used for loading values.
4944/// \param i
4945/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4946/// \param s
4947/// A literal constant scale factor for the indexes in \a i. Must be
4948/// 1, 2, 4, or 8.
4949/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Implementation note: the first builtin argument is an undefined vector and
 * the fourth is _mm_set1_epi32(-1). Presumably these are the pass-through
 * source and an all-ones mask, so every element is loaded unconditionally --
 * confirm against the builtin's definition. */
4950#define _mm_i32gather_epi32(m, i, s) \
4951 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
4952 (int const *)(m), (__v4si)(__m128i)(i), \
4953 (__v4si)_mm_set1_epi32(-1), (s)))
4954
4955/// Gathers eight 32-bit integer values from memory \a m using scaled
4956/// indexes from the 256-bit vector of [8 x i32] in \a i.
4957///
4958/// \code{.operation}
4959/// FOR element := 0 to 7
4960/// j := element*32
4961/// k := element*32
4962/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4963/// ENDFOR
4964/// \endcode
4965///
4966/// \headerfile <immintrin.h>
4967///
4968/// \code
4969/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
4970/// \endcode
4971///
4972/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4973///
4974/// \param m
4975/// A pointer to the memory used for loading values.
4976/// \param i
4977/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4978/// \param s
4979/// A literal constant scale factor for the indexes in \a i. Must be
4980/// 1, 2, 4, or 8.
4981/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Implementation note: the first builtin argument is an undefined vector and
 * the fourth is _mm256_set1_epi32(-1). Presumably these are the pass-through
 * source and an all-ones mask, so every element is loaded unconditionally --
 * confirm against the builtin's definition. */
4982#define _mm256_i32gather_epi32(m, i, s) \
4983 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
4984 (int const *)(m), (__v8si)(__m256i)(i), \
4985 (__v8si)_mm256_set1_epi32(-1), (s)))
4986
4987/// Gathers two 32-bit integer values from memory \a m using scaled indexes
4988/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
4989/// of the result are zeroed.
4990///
4991/// \code{.operation}
4992/// FOR element := 0 to 1
4993/// j := element*32
4994/// k := element*64
4995/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4996/// ENDFOR
4997/// result[127:64] := 0
4998/// \endcode
4999///
5000/// \headerfile <immintrin.h>
5001///
5002/// \code
5003/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5004/// \endcode
5005///
5006/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5007///
5008/// \param m
5009/// A pointer to the memory used for loading values.
5010/// \param i
5011/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5012/// \param s
5013/// A literal constant scale factor for the indexes in \a i. Must be
5014/// 1, 2, 4, or 8.
5015/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Implementation note: the first builtin argument is an undefined vector and
 * the fourth is _mm_set1_epi32(-1). Presumably these are the pass-through
 * source and an all-ones mask, so every element is loaded unconditionally --
 * confirm against the builtin's definition. */
5016#define _mm_i64gather_epi32(m, i, s) \
5017 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5018 (int const *)(m), (__v2di)(__m128i)(i), \
5019 (__v4si)_mm_set1_epi32(-1), (s)))
5020
5021/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5022/// from the 256-bit vector of [4 x i64] in \a i.
5023///
5024/// \code{.operation}
5025/// FOR element := 0 to 3
5026/// j := element*32
5027/// k := element*64
5028/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5029/// ENDFOR
5030/// \endcode
5031///
5032/// \headerfile <immintrin.h>
5033///
5034/// \code
5035/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5036/// \endcode
5037///
5038/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5039///
5040/// \param m
5041/// A pointer to the memory used for loading values.
5042/// \param i
5043/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5044/// \param s
5045/// A literal constant scale factor for the indexes in \a i. Must be
5046/// 1, 2, 4, or 8.
5047/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Implementation note: the first builtin argument is an undefined 128-bit
 * vector and the fourth is _mm_set1_epi32(-1). Presumably these are the
 * pass-through source and an all-ones mask, so every element is loaded
 * unconditionally -- confirm against the builtin's definition. */
5048#define _mm256_i64gather_epi32(m, i, s) \
5049 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5050 (int const *)(m), (__v4di)(__m256i)(i), \
5051 (__v4si)_mm_set1_epi32(-1), (s)))
5052
5053/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5054/// from the 128-bit vector of [4 x i32] in \a i.
5055///
5056/// \code{.operation}
5057/// FOR element := 0 to 1
5058/// j := element*64
5059/// k := element*32
5060/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5061/// ENDFOR
5062/// \endcode
5063///
5064/// \headerfile <immintrin.h>
5065///
5066/// \code
5067/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5068/// \endcode
5069///
5070/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5071///
5072/// \param m
5073/// A pointer to the memory used for loading values.
5074/// \param i
5075/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5076/// the first two elements are used.
5077/// \param s
5078/// A literal constant scale factor for the indexes in \a i. Must be
5079/// 1, 2, 4, or 8.
5080/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Implementation note: the first builtin argument is an undefined vector and
 * the fourth is _mm_set1_epi64x(-1). Presumably these are the pass-through
 * source and an all-ones mask, so every element is loaded unconditionally --
 * confirm against the builtin's definition. */
5081#define _mm_i32gather_epi64(m, i, s) \
5082 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5083 (long long const *)(m), \
5084 (__v4si)(__m128i)(i), \
5085 (__v2di)_mm_set1_epi64x(-1), (s)))
5086
5087/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5088/// from the 128-bit vector of [4 x i32] in \a i.
5089///
5090/// \code{.operation}
5091/// FOR element := 0 to 3
5092/// j := element*64
5093/// k := element*32
5094/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5095/// ENDFOR
5096/// \endcode
5097///
5098/// \headerfile <immintrin.h>
5099///
5100/// \code
5101/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5102/// \endcode
5103///
5104/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5105///
5106/// \param m
5107/// A pointer to the memory used for loading values.
5108/// \param i
5109/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5110/// \param s
5111/// A literal constant scale factor for the indexes in \a i. Must be
5112/// 1, 2, 4, or 8.
5113/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Implementation note: the first builtin argument is an undefined vector and
 * the fourth is _mm256_set1_epi64x(-1). Presumably these are the
 * pass-through source and an all-ones mask, so every element is loaded
 * unconditionally -- confirm against the builtin's definition. */
5114#define _mm256_i32gather_epi64(m, i, s) \
5115 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5116 (long long const *)(m), \
5117 (__v4si)(__m128i)(i), \
5118 (__v4di)_mm256_set1_epi64x(-1), (s)))
5119
5120/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5121/// from the 128-bit vector of [2 x i64] in \a i.
5122///
5123/// \code{.operation}
5124/// FOR element := 0 to 1
5125/// j := element*64
5126/// k := element*64
5127/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5128/// ENDFOR
5129/// \endcode
5130///
5131/// \headerfile <immintrin.h>
5132///
5133/// \code
5134/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5135/// \endcode
5136///
5137/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5138///
5139/// \param m
5140/// A pointer to the memory used for loading values.
5141/// \param i
5142/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5143/// \param s
5144/// A literal constant scale factor for the indexes in \a i. Must be
5145/// 1, 2, 4, or 8.
5146/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Implementation note: the first builtin argument is an undefined vector and
 * the fourth is _mm_set1_epi64x(-1). Presumably these are the pass-through
 * source and an all-ones mask, so every element is loaded unconditionally --
 * confirm against the builtin's definition. */
5147#define _mm_i64gather_epi64(m, i, s) \
5148 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5149 (long long const *)(m), \
5150 (__v2di)(__m128i)(i), \
5151 (__v2di)_mm_set1_epi64x(-1), (s)))
5152
5153/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5154/// from the 256-bit vector of [4 x i64] in \a i.
5155///
5156/// \code{.operation}
5157/// FOR element := 0 to 3
5158/// j := element*64
5159/// k := element*64
5160/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5161/// ENDFOR
5162/// \endcode
5163///
5164/// \headerfile <immintrin.h>
5165///
5166/// \code
5167/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5168/// \endcode
5169///
5170/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5171///
5172/// \param m
5173/// A pointer to the memory used for loading values.
5174/// \param i
5175/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5176/// \param s
5177/// A literal constant scale factor for the indexes in \a i. Must be
5178/// 1, 2, 4, or 8.
5179/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Implementation note: the first builtin argument is an undefined vector and
 * the fourth is _mm256_set1_epi64x(-1). Presumably these are the
 * pass-through source and an all-ones mask, so every element is loaded
 * unconditionally -- confirm against the builtin's definition. */
5180#define _mm256_i64gather_epi64(m, i, s) \
5181 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5182 (long long const *)(m), \
5183 (__v4di)(__m256i)(i), \
5184 (__v4di)_mm256_set1_epi64x(-1), (s)))
5185
/* Remove the default-attribute helper macros that this header defined for
 * its own function declarations. */
5186#undef __DEFAULT_FN_ATTRS256
5187#undef __DEFAULT_FN_ATTRS128
5188#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
5189#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5190
5191#endif /* __AVX2INTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
Definition altivec.h:578
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi32_epi64(__m128i __V)
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ int __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_movemask_epi8(__m256i __a)
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vecto...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(const void *__V)
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsubs_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than an...
Definition avx2intrin.h:722
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epu32(__m256i __a, __m256i __b)
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srav_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_andnot_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit int...
Definition avx2intrin.h:466
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation...
Definition avx2intrin.h:386
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maddubs_epi16(__m256i __a, __m256i __b)
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed by...
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x f...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M)
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to mem...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epu16(__m256i __a, __m256i __b)
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upp...
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastsd_pd(__m128d __a)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M)
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi16(__m128i __V)
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsi...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packus_epi32(__m256i __V1, __m256i __V2)
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation,...
Definition avx2intrin.h:261
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sra_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sll_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi32(__m256i __a)
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a a...
Definition avx2intrin.h:139
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sll_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srav_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi64(__m128i __V)
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and r...
Definition avx2intrin.h:670
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed sa...
Definition avx2intrin.h:368
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi32(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memor...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu32_epi64(__m128i __V)
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi32(__m128i __V)
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_shuffle_epi8(__m256i __a, __m256i __b)
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the ...
Definition avx2intrin.h:551
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and r...
Definition avx2intrin.h:696
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M)
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srl_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and r...
Definition avx2intrin.h:938
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srl_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greate...
Definition avx2intrin.h:750
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi32(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and re...
Definition avx2intrin.h:969
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 b...
Definition avx2intrin.h:279
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi32(__m128i __V)
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result'...
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi32(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and ret...
Definition avx2intrin.h:869
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sll_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits spec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater...
Definition avx2intrin.h:776
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi32(__m256i __a, __m256i __b)
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned satur...
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using sign...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns t...
Definition avx2intrin.h:618
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi16(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in _...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi8(__m256i __a)
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each va...
Definition avx2intrin.h:107
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi64(__m128i __V)
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi64(__m128i __V)
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadds_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using ...
Definition avx2intrin.h:903
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi64(__m256i __a, __m256i __b)
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the ...
Definition avx2intrin.h:333
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi64(__m128i __V)
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_and_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
Definition avx2intrin.h:448
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsi128_si256(__m128i __X)
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi32(__m256i __a, __m256i __b)
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the ...
Definition avx2intrin.h:315
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sra_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srl_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packus_epi16(__m256i __a, __m256i __b)
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation,...
Definition avx2intrin.h:230
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m256i __a)
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a ...
Definition avx2intrin.h:123
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the...
Definition avx2intrin.h:297
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_or_si256(__m256i __a, __m256i __b)
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsd_pd(__m128d __X)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi32(__m128i __V)
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and re...
Definition avx2intrin.h:838
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu16(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns each average in the corresponding element of the 256-bit result.
Definition avx2intrin.h:517
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_xor_si256(__m256i __a, __m256i __b)
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sign_epi8(__m256i __a, __m256i __b)
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned ...
Definition avx2intrin.h:403
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M)
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi64(__m256i __a, __m256i __b)
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_madd_epi16(__m256i __a, __m256i __b)
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit interme...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 128-bit vector of [4 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 256-bit vector of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi64(__m256i __X, __m256i __Y)
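Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.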
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each difference in the corresponding byte of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packs_epi32(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation,...
Definition avx2intrin.h:200
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater...
Definition avx2intrin.h:802
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b)
Computes four sum-of-absolute-difference (SAD) operations on sets of eight unsigned 8-bit integers from the 256-bit integer vectors __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation,...
Definition avx2intrin.h:351
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and ...
Definition avx2intrin.h:644
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_packs_epi16(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation,...
Definition avx2intrin.h:169
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu8(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns each average in the corresponding byte of the 256-bit result.
Definition avx2intrin.h:492
static __inline__ void int __a
Definition emmintrin.h:4077
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19