clang 20.0.0git
avx2intrin.h
Go to the documentation of this file.
1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX2INTRIN_H
15#define __AVX2INTRIN_H
16
/* Default attributes applied to every function defined in this file:
 * always-inline, no debug info, an "avx2" target and a minimum vector width
 * of 256 or 128 bits. When __EVEX512__ is defined without __AVX10_1_512__,
 * "no-evex512" is appended to the target feature list.
 */
#if !defined(__EVEX512__) || defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
                 __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
#endif
33
34/* SSE4 Multiple Packed Sums of Absolute Difference. */
35/// Computes sixteen sum of absolute difference (SAD) operations on sets of
36/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
37/// \a Y.
38///
39/// Eight SAD results are computed using the lower half of the input
40/// vectors, and another eight using the upper half. These 16-bit values
41/// are returned in the lower and upper halves of the 256-bit result,
42/// respectively.
43///
44/// A single SAD operation selects four bytes from \a X and four bytes from
45/// \a Y as input. It computes the differences between each \a X byte and
46/// the corresponding \a Y byte, takes the absolute value of each
47/// difference, and sums these four values to form one 16-bit result. The
48/// intrinsic computes 16 of these results with different sets of input
49/// bytes.
50///
51/// For each set of eight results, the SAD operations use the same four
52/// bytes from \a Y, starting at bit position \a M[1:0] times 32 (lower
53/// half) or \a M[4:3] times 32 (upper half). The eight operations use
54/// successive sets of four bytes from \a X, the first set starting at bit
55/// position \a M[2] times 32 (lower half) or \a M[5] times 32 (upper half).
56/// These positions are relative to the 128-bit lane of each set.
57///
58/// \code{.operation}
59/// r := 0
60/// FOR i := 0 TO 1
61/// j := i*3
62/// Ybase := M[j+1:j]*32 + i*128
63/// Xbase := M[j+2]*32 + i*128
64/// FOR k := 0 TO 7
65/// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
66/// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
67/// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
68/// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
69/// result[r+15:r] := temp0 + temp1 + temp2 + temp3
70/// Xbase := Xbase + 8
71/// r := r + 16
72/// ENDFOR
73/// ENDFOR
74/// \endcode
75///
76/// \headerfile <immintrin.h>
77///
78/// \code
79/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
80/// \endcode
81///
82/// This intrinsic corresponds to the \c VMPSADBW instruction.
83///
84/// \param X
85/// A 256-bit integer vector containing one of the inputs.
86/// \param Y
87/// A 256-bit integer vector containing one of the inputs.
88/// \param M
89/// An unsigned immediate value specifying the starting positions of the
90/// bytes to operate on.
91/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: X and Y are viewed as [32 x i8] and M is forwarded to the
   builtin as an int. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256( \
      (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(M)))
95
96/// Computes the absolute value of each signed byte in the 256-bit integer
97/// vector \a __a and returns each value in the corresponding byte of
98/// the result.
99///
100/// \headerfile <immintrin.h>
101///
102/// This intrinsic corresponds to the \c VPABSB instruction.
103///
104/// \param __a
105/// A 256-bit integer vector.
106/// \returns A 256-bit integer vector containing the result.
107static __inline__ __m256i __DEFAULT_FN_ATTRS256
109{
110 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
111}
112
113/// Computes the absolute value of each signed 16-bit element in the 256-bit
114/// vector of [16 x i16] in \a __a and returns each value in the
115/// corresponding element of the result.
116///
117/// \headerfile <immintrin.h>
118///
119/// This intrinsic corresponds to the \c VPABSW instruction.
120///
121/// \param __a
122/// A 256-bit vector of [16 x i16].
123/// \returns A 256-bit vector of [16 x i16] containing the result.
124static __inline__ __m256i __DEFAULT_FN_ATTRS256
126{
127 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
128}
129
130/// Computes the absolute value of each signed 32-bit element in the 256-bit
131/// vector of [8 x i32] in \a __a and returns each value in the
132/// corresponding element of the result.
133///
134/// \headerfile <immintrin.h>
135///
136/// This intrinsic corresponds to the \c VPABSD instruction.
137///
138/// \param __a
139/// A 256-bit vector of [8 x i32].
140/// \returns A 256-bit vector of [8 x i32] containing the result.
141static __inline__ __m256i __DEFAULT_FN_ATTRS256
143{
144 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
145}
146
147/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
148/// integers using signed saturation, and returns the 256-bit result.
149///
150/// \code{.operation}
151/// FOR i := 0 TO 7
152/// j := i*16
153/// k := i*8
154/// result[7+k:k] := SATURATE8(__a[15+j:j])
155/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
156/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
157/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
158/// ENDFOR
159/// \endcode
160///
161/// \headerfile <immintrin.h>
162///
163/// This intrinsic corresponds to the \c VPACKSSWB instruction.
164///
165/// \param __a
166/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
167/// result[191:128].
168/// \param __b
169/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
170/// result[255:192].
171/// \returns A 256-bit integer vector containing the result.
172static __inline__ __m256i __DEFAULT_FN_ATTRS256
173_mm256_packs_epi16(__m256i __a, __m256i __b)
174{
175 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
176}
177
178/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
179/// integers using signed saturation, and returns the resulting 256-bit
180/// vector of [16 x i16].
181///
182/// \code{.operation}
183/// FOR i := 0 TO 3
184/// j := i*32
185/// k := i*16
186/// result[15+k:k] := SATURATE16(__a[31+j:j])
187/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
188/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
189/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
190/// ENDFOR
191/// \endcode
192///
193/// \headerfile <immintrin.h>
194///
195/// This intrinsic corresponds to the \c VPACKSSDW instruction.
196///
197/// \param __a
198/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
199/// result[191:128].
200/// \param __b
201/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
202/// result[255:192].
203/// \returns A 256-bit vector of [16 x i16] containing the result.
204static __inline__ __m256i __DEFAULT_FN_ATTRS256
205_mm256_packs_epi32(__m256i __a, __m256i __b)
206{
207 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
208}
209
210/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
211/// using unsigned saturation, and returns the 256-bit result.
212///
213/// \code{.operation}
214/// FOR i := 0 TO 7
215/// j := i*16
216/// k := i*8
217/// result[7+k:k] := SATURATE8U(__a[15+j:j])
218/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
219/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
220/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
221/// ENDFOR
222/// \endcode
223///
224/// \headerfile <immintrin.h>
225///
226/// This intrinsic corresponds to the \c VPACKUSWB instruction.
227///
228/// \param __a
229/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
230/// result[191:128].
231/// \param __b
232/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
233/// result[255:192].
234/// \returns A 256-bit integer vector containing the result.
235static __inline__ __m256i __DEFAULT_FN_ATTRS256
236_mm256_packus_epi16(__m256i __a, __m256i __b)
237{
238 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
239}
240
241/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
242/// using unsigned saturation, and returns the resulting 256-bit vector of
243/// [16 x i16].
244///
245/// \code{.operation}
246/// FOR i := 0 TO 3
247/// j := i*32
248/// k := i*16
249/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
250/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
251/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
252/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
253/// ENDFOR
254/// \endcode
255///
256/// \headerfile <immintrin.h>
257///
258/// This intrinsic corresponds to the \c VPACKUSDW instruction.
259///
260/// \param __V1
261/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
262/// result[191:128].
263/// \param __V2
264/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
265/// result[255:192].
266/// \returns A 256-bit vector of [16 x i16] containing the result.
267static __inline__ __m256i __DEFAULT_FN_ATTRS256
268_mm256_packus_epi32(__m256i __V1, __m256i __V2)
269{
270 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
271}
272
273/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
274/// vectors and returns the lower 8 bits of each sum in the corresponding
275/// byte of the 256-bit integer vector result (overflow is ignored).
276///
277/// \headerfile <immintrin.h>
278///
279/// This intrinsic corresponds to the \c VPADDB instruction.
280///
281/// \param __a
282/// A 256-bit integer vector containing one of the source operands.
283/// \param __b
284/// A 256-bit integer vector containing one of the source operands.
285/// \returns A 256-bit integer vector containing the sums.
286static __inline__ __m256i __DEFAULT_FN_ATTRS256
287_mm256_add_epi8(__m256i __a, __m256i __b)
288{
289 return (__m256i)((__v32qu)__a + (__v32qu)__b);
290}
291
292/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
293/// [16 x i16] and returns the lower 16 bits of each sum in the
294/// corresponding element of the [16 x i16] result (overflow is ignored).
295///
296/// \headerfile <immintrin.h>
297///
298/// This intrinsic corresponds to the \c VPADDW instruction.
299///
300/// \param __a
301/// A 256-bit vector of [16 x i16] containing one of the source operands.
302/// \param __b
303/// A 256-bit vector of [16 x i16] containing one of the source operands.
304/// \returns A 256-bit vector of [16 x i16] containing the sums.
305static __inline__ __m256i __DEFAULT_FN_ATTRS256
306_mm256_add_epi16(__m256i __a, __m256i __b)
307{
308 return (__m256i)((__v16hu)__a + (__v16hu)__b);
309}
310
311/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
312/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
313/// element of the [8 x i32] result (overflow is ignored).
314///
315/// \headerfile <immintrin.h>
316///
317/// This intrinsic corresponds to the \c VPADDD instruction.
318///
319/// \param __a
320/// A 256-bit vector of [8 x i32] containing one of the source operands.
321/// \param __b
322/// A 256-bit vector of [8 x i32] containing one of the source operands.
323/// \returns A 256-bit vector of [8 x i32] containing the sums.
324static __inline__ __m256i __DEFAULT_FN_ATTRS256
325_mm256_add_epi32(__m256i __a, __m256i __b)
326{
327 return (__m256i)((__v8su)__a + (__v8su)__b);
328}
329
330/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
331/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
332/// element of the [4 x i64] result (overflow is ignored).
333///
334/// \headerfile <immintrin.h>
335///
336/// This intrinsic corresponds to the \c VPADDQ instruction.
337///
338/// \param __a
339/// A 256-bit vector of [4 x i64] containing one of the source operands.
340/// \param __b
341/// A 256-bit vector of [4 x i64] containing one of the source operands.
342/// \returns A 256-bit vector of [4 x i64] containing the sums.
343static __inline__ __m256i __DEFAULT_FN_ATTRS256
344_mm256_add_epi64(__m256i __a, __m256i __b)
345{
346 return (__m256i)((__v4du)__a + (__v4du)__b);
347}
348
349/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
350/// vectors using signed saturation, and returns each sum in the
351/// corresponding byte of the 256-bit integer vector result.
352///
353/// \headerfile <immintrin.h>
354///
355/// This intrinsic corresponds to the \c VPADDSB instruction.
356///
357/// \param __a
358/// A 256-bit integer vector containing one of the source operands.
359/// \param __b
360/// A 256-bit integer vector containing one of the source operands.
361/// \returns A 256-bit integer vector containing the sums.
362static __inline__ __m256i __DEFAULT_FN_ATTRS256
363_mm256_adds_epi8(__m256i __a, __m256i __b)
364{
365 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
366}
367
368/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
369/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
370///
371/// \headerfile <immintrin.h>
372///
373/// This intrinsic corresponds to the \c VPADDSW instruction.
374///
375/// \param __a
376/// A 256-bit vector of [16 x i16] containing one of the source operands.
377/// \param __b
378/// A 256-bit vector of [16 x i16] containing one of the source operands.
379/// \returns A 256-bit vector of [16 x i16] containing the sums.
380static __inline__ __m256i __DEFAULT_FN_ATTRS256
381_mm256_adds_epi16(__m256i __a, __m256i __b)
382{
383 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
384}
385
386/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
387/// vectors using unsigned saturation, and returns each sum in the
388/// corresponding byte of the 256-bit integer vector result.
389///
390/// \headerfile <immintrin.h>
391///
392/// This intrinsic corresponds to the \c VPADDUSB instruction.
393///
394/// \param __a
395/// A 256-bit integer vector containing one of the source operands.
396/// \param __b
397/// A 256-bit integer vector containing one of the source operands.
398/// \returns A 256-bit integer vector containing the sums.
399static __inline__ __m256i __DEFAULT_FN_ATTRS256
400_mm256_adds_epu8(__m256i __a, __m256i __b)
401{
402 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
403}
404
405/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
406/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
407///
408/// \headerfile <immintrin.h>
409///
410/// This intrinsic corresponds to the \c VPADDUSW instruction.
411///
412/// \param __a
413/// A 256-bit vector of [16 x i16] containing one of the source operands.
414/// \param __b
415/// A 256-bit vector of [16 x i16] containing one of the source operands.
416/// \returns A 256-bit vector of [16 x i16] containing the sums.
417static __inline__ __m256i __DEFAULT_FN_ATTRS256
418_mm256_adds_epu16(__m256i __a, __m256i __b)
419{
420 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
421}
422
423/// Uses the lower half of the 256-bit vector \a a as the upper half of a
424/// temporary 256-bit value, and the lower half of the 256-bit vector \a b
425/// as the lower half of the temporary value. Right-shifts the temporary
426/// value by \a n bytes, and uses the lower 16 bytes of the shifted value
427/// as the lower 16 bytes of the result. Uses the upper halves of \a a and
428/// \a b to make another temporary value, right shifts by \a n, and uses
429/// the lower 16 bytes of the shifted value as the upper 16 bytes of the
430/// result.
431///
432/// \headerfile <immintrin.h>
433///
434/// \code
435/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
436/// \endcode
437///
438/// This intrinsic corresponds to the \c VPALIGNR instruction.
439///
440/// \param a
441/// A 256-bit integer vector containing source values.
442/// \param b
443/// A 256-bit integer vector containing source values.
444/// \param n
445/// An immediate value specifying the number of bytes to shift.
446/// \returns A 256-bit integer vector containing the result.
/* Macro form: a and b are viewed as [32 x i8]. The byte count n is cast to
   int before it reaches the builtin, matching how the other immediate-taking
   macros in this file (_mm256_mpsadbw_epu8, _mm256_blend_epi16) pass their
   immediates. */
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (int)(n)))
450
451/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
452/// \a __b.
453///
454/// \headerfile <immintrin.h>
455///
456/// This intrinsic corresponds to the \c VPAND instruction.
457///
458/// \param __a
459/// A 256-bit integer vector.
460/// \param __b
461/// A 256-bit integer vector.
462/// \returns A 256-bit integer vector containing the result.
463static __inline__ __m256i __DEFAULT_FN_ATTRS256
464_mm256_and_si256(__m256i __a, __m256i __b)
465{
466 return (__m256i)((__v4du)__a & (__v4du)__b);
467}
468
469/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
470/// the bitwise NOT of the 256-bit integer vector in \a __a.
471///
472/// \headerfile <immintrin.h>
473///
474/// This intrinsic corresponds to the \c VPANDN instruction.
475///
476/// \param __a
477/// A 256-bit integer vector.
478/// \param __b
479/// A 256-bit integer vector.
480/// \returns A 256-bit integer vector containing the result.
481static __inline__ __m256i __DEFAULT_FN_ATTRS256
482_mm256_andnot_si256(__m256i __a, __m256i __b)
483{
484 return (__m256i)(~(__v4du)__a & (__v4du)__b);
485}
486
487/// Computes the averages of the corresponding unsigned bytes in the two
488/// 256-bit integer vectors in \a __a and \a __b and returns each
489/// average in the corresponding byte of the 256-bit result.
490///
491/// \code{.operation}
492/// FOR i := 0 TO 31
493/// j := i*8
494/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
495/// ENDFOR
496/// \endcode
497///
498/// \headerfile <immintrin.h>
499///
500/// This intrinsic corresponds to the \c VPAVGB instruction.
501///
502/// \param __a
503/// A 256-bit integer vector.
504/// \param __b
505/// A 256-bit integer vector.
506/// \returns A 256-bit integer vector containing the result.
507static __inline__ __m256i __DEFAULT_FN_ATTRS256
508_mm256_avg_epu8(__m256i __a, __m256i __b)
509{
510 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
511}
512
513/// Computes the averages of the corresponding unsigned 16-bit integers in
514/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
515/// each average in the corresponding element of the 256-bit result.
516///
517/// \code{.operation}
518/// FOR i := 0 TO 15
519/// j := i*16
520/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
521/// ENDFOR
522/// \endcode
523///
524/// \headerfile <immintrin.h>
525///
526/// This intrinsic corresponds to the \c VPAVGW instruction.
527///
528/// \param __a
529/// A 256-bit vector of [16 x i16].
530/// \param __b
531/// A 256-bit vector of [16 x i16].
532/// \returns A 256-bit vector of [16 x i16] containing the result.
533static __inline__ __m256i __DEFAULT_FN_ATTRS256
534_mm256_avg_epu16(__m256i __a, __m256i __b)
535{
536 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
537}
538
539/// Merges 8-bit integer values from either of the two 256-bit vectors
540/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
541/// the resulting 256-bit integer vector.
542///
543/// \code{.operation}
544/// FOR i := 0 TO 31
545/// j := i*8
546/// IF __M[7+j] == 0
547/// result[7+j:j] := __V1[7+j:j]
548/// ELSE
549/// result[7+j:j] := __V2[7+j:j]
550/// FI
551/// ENDFOR
552/// \endcode
553///
554/// \headerfile <immintrin.h>
555///
556/// This intrinsic corresponds to the \c VPBLENDVB instruction.
557///
558/// \param __V1
559/// A 256-bit integer vector containing source values.
560/// \param __V2
561/// A 256-bit integer vector containing source values.
562/// \param __M
563/// A 256-bit integer vector, with bit [7] of each byte specifying the
564/// source for each corresponding byte of the result. When the mask bit
565/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
566/// \a __V2.
567/// \returns A 256-bit integer vector containing the result.
568static __inline__ __m256i __DEFAULT_FN_ATTRS256
569_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
570{
571 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
572 (__v32qi)__M);
573}
574
575/// Merges 16-bit integer values from either of the two 256-bit vectors
576/// \a V1 or \a V2, as specified by the immediate integer operand \a M,
577/// and returns the resulting 256-bit vector of [16 x i16].
578///
579/// \code{.operation}
580/// FOR i := 0 TO 7
581/// j := i*16
582/// IF M[i] == 0
583/// result[15+j:j] := V1[15+j:j]
584/// result[143+j:128+j] := V1[143+j:128+j]
585/// ELSE
586/// result[15+j:j] := V2[15+j:j]
587/// result[143+j:128+j] := V2[143+j:128+j]
588/// FI
589/// ENDFOR
590/// \endcode
591///
592/// \headerfile <immintrin.h>
593///
594/// \code
595/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
596/// \endcode
597///
598/// This intrinsic corresponds to the \c VPBLENDW instruction.
599///
600/// \param V1
601/// A 256-bit vector of [16 x i16] containing source values.
602/// \param V2
603/// A 256-bit vector of [16 x i16] containing source values.
604/// \param M
605/// An immediate 8-bit integer operand, with bits [7:0] specifying the
606/// source for each element of the result. The position of the mask bit
607/// corresponds to the index of a copied value. When a mask bit is 0, the
608/// element is copied from \a V1; otherwise, it is copied from \a V2.
609/// \a M[0] determines the source for elements 0 and 8, \a M[1] for
610/// elements 1 and 9, and so forth.
611/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: V1 and V2 are viewed as [16 x i16] and M is forwarded to the
   builtin as an int. */
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256( \
      (__v16hi)(__m256i)(V1), (__v16hi)(__m256i)(V2), (int)(M)))
615
616/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
617/// \a __b for equality and returns the outcomes in the corresponding
618/// bytes of the 256-bit result.
619///
620/// \code{.operation}
621/// FOR i := 0 TO 31
622/// j := i*8
623/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
624/// ENDFOR
625/// \endcode
626///
627/// \headerfile <immintrin.h>
628///
629/// This intrinsic corresponds to the \c VPCMPEQB instruction.
630///
631/// \param __a
632/// A 256-bit integer vector containing one of the inputs.
633/// \param __b
634/// A 256-bit integer vector containing one of the inputs.
635/// \returns A 256-bit integer vector containing the result.
636static __inline__ __m256i __DEFAULT_FN_ATTRS256
637_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
638{
639 return (__m256i)((__v32qi)__a == (__v32qi)__b);
640}
641
642/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
643/// \a __a and \a __b for equality and returns the outcomes in the
644/// corresponding elements of the 256-bit result.
645///
646/// \code{.operation}
647/// FOR i := 0 TO 15
648/// j := i*16
649/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
650/// ENDFOR
651/// \endcode
652///
653/// \headerfile <immintrin.h>
654///
655/// This intrinsic corresponds to the \c VPCMPEQW instruction.
656///
657/// \param __a
658/// A 256-bit vector of [16 x i16] containing one of the inputs.
659/// \param __b
660/// A 256-bit vector of [16 x i16] containing one of the inputs.
661/// \returns A 256-bit vector of [16 x i16] containing the result.
662static __inline__ __m256i __DEFAULT_FN_ATTRS256
663_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
664{
665 return (__m256i)((__v16hi)__a == (__v16hi)__b);
666}
667
668/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
669/// \a __a and \a __b for equality and returns the outcomes in the
670/// corresponding elements of the 256-bit result.
671///
672/// \code{.operation}
673/// FOR i := 0 TO 7
674/// j := i*32
675/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
676/// ENDFOR
677/// \endcode
678///
679/// \headerfile <immintrin.h>
680///
681/// This intrinsic corresponds to the \c VPCMPEQD instruction.
682///
683/// \param __a
684/// A 256-bit vector of [8 x i32] containing one of the inputs.
685/// \param __b
686/// A 256-bit vector of [8 x i32] containing one of the inputs.
687/// \returns A 256-bit vector of [8 x i32] containing the result.
688static __inline__ __m256i __DEFAULT_FN_ATTRS256
689_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
690{
691 return (__m256i)((__v8si)__a == (__v8si)__b);
692}
693
694/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
695/// \a __a and \a __b for equality and returns the outcomes in the
696/// corresponding elements of the 256-bit result.
697///
698/// \code{.operation}
699/// FOR i := 0 TO 3
700/// j := i*64
701/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
702/// ENDFOR
703/// \endcode
704///
705/// \headerfile <immintrin.h>
706///
707/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
708///
709/// \param __a
710/// A 256-bit vector of [4 x i64] containing one of the inputs.
711/// \param __b
712/// A 256-bit vector of [4 x i64] containing one of the inputs.
713/// \returns A 256-bit vector of [4 x i64] containing the result.
714static __inline__ __m256i __DEFAULT_FN_ATTRS256
715_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
716{
717 return (__m256i)((__v4di)__a == (__v4di)__b);
718}
719
720/// Compares corresponding signed bytes in the 256-bit integer vectors in
721/// \a __a and \a __b for greater-than and returns the outcomes in the
722/// corresponding bytes of the 256-bit result.
723///
724/// \code{.operation}
725/// FOR i := 0 TO 31
726/// j := i*8
727/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
728/// ENDFOR
729/// \endcode
730///
731/// \headerfile <immintrin.h>
732///
733/// This intrinsic corresponds to the \c VPCMPGTB instruction.
734///
735/// \param __a
736/// A 256-bit integer vector containing one of the inputs.
737/// \param __b
738/// A 256-bit integer vector containing one of the inputs.
739/// \returns A 256-bit integer vector containing the result.
740static __inline__ __m256i __DEFAULT_FN_ATTRS256
741_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
742{
743 /* This function always performs a signed comparison, but __v32qi is a char
744 which may be signed or unsigned, so use __v32qs. */
745 return (__m256i)((__v32qs)__a > (__v32qs)__b);
746}
747
748/// Compares corresponding signed elements in the 256-bit vectors of
749/// [16 x i16] in \a __a and \a __b for greater-than and returns the
750/// outcomes in the corresponding elements of the 256-bit result.
751///
752/// \code{.operation}
753/// FOR i := 0 TO 15
754/// j := i*16
755/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
756/// ENDFOR
757/// \endcode
758///
759/// \headerfile <immintrin.h>
760///
761/// This intrinsic corresponds to the \c VPCMPGTW instruction.
762///
763/// \param __a
764/// A 256-bit vector of [16 x i16] containing one of the inputs.
765/// \param __b
766/// A 256-bit vector of [16 x i16] containing one of the inputs.
767/// \returns A 256-bit vector of [16 x i16] containing the result.
768static __inline__ __m256i __DEFAULT_FN_ATTRS256
769_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
770{
771 return (__m256i)((__v16hi)__a > (__v16hi)__b);
772}
773
774/// Compares corresponding signed elements in the 256-bit vectors of
775/// [8 x i32] in \a __a and \a __b for greater-than and returns the
776/// outcomes in the corresponding elements of the 256-bit result.
777///
778/// \code{.operation}
779/// FOR i := 0 TO 7
780/// j := i*32
781/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
782/// ENDFOR
783/// \endcode
784///
785/// \headerfile <immintrin.h>
786///
787/// This intrinsic corresponds to the \c VPCMPGTD instruction.
788///
789/// \param __a
790/// A 256-bit vector of [8 x i32] containing one of the inputs.
791/// \param __b
792/// A 256-bit vector of [8 x i32] containing one of the inputs.
793/// \returns A 256-bit vector of [8 x i32] containing the result.
794static __inline__ __m256i __DEFAULT_FN_ATTRS256
795_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
796{
797 return (__m256i)((__v8si)__a > (__v8si)__b);
798}
799
800/// Compares corresponding signed elements in the 256-bit vectors of
801/// [4 x i64] in \a __a and \a __b for greater-than and returns the
802/// outcomes in the corresponding elements of the 256-bit result.
803///
804/// \code{.operation}
805/// FOR i := 0 TO 3
806/// j := i*64
807/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
808/// ENDFOR
809/// \endcode
810///
811/// \headerfile <immintrin.h>
812///
813/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
814///
815/// \param __a
816/// A 256-bit vector of [4 x i64] containing one of the inputs.
817/// \param __b
818/// A 256-bit vector of [4 x i64] containing one of the inputs.
819/// \returns A 256-bit vector of [4 x i64] containing the result.
820static __inline__ __m256i __DEFAULT_FN_ATTRS256
821_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
822{
823 return (__m256i)((__v4di)__a > (__v4di)__b);
824}
825
826/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
827/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
828/// element of the [16 x i16] result (overflow is ignored). Sums from
829/// \a __a are returned in the lower 64 bits of each 128-bit half of the
830/// result; sums from \a __b are returned in the upper 64 bits of each
831/// 128-bit half of the result.
832///
833/// \code{.operation}
834/// FOR i := 0 TO 1
835/// j := i*128
836/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
837/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
838/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
839/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
840/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
841/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
842/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
843/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
844/// ENDFOR
845/// \endcode
846///
847/// \headerfile <immintrin.h>
848///
849/// This intrinsic corresponds to the \c VPHADDW instruction.
850///
851/// \param __a
852/// A 256-bit vector of [16 x i16] containing one of the source operands.
853/// \param __b
854/// A 256-bit vector of [16 x i16] containing one of the source operands.
855/// \returns A 256-bit vector of [16 x i16] containing the sums.
856static __inline__ __m256i __DEFAULT_FN_ATTRS256
857_mm256_hadd_epi16(__m256i __a, __m256i __b)
858{
859 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
860}
861
862/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
863/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
864/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
865/// are returned in the lower 64 bits of each 128-bit half of the result;
866/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
867/// of the result.
868///
869/// \code{.operation}
870/// FOR i := 0 TO 1
871/// j := i*128
872/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
873/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
874/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
875/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
876/// ENDFOR
877/// \endcode
878///
879/// \headerfile <immintrin.h>
880///
881/// This intrinsic corresponds to the \c VPHADDD instruction.
882///
883/// \param __a
884/// A 256-bit vector of [8 x i32] containing one of the source operands.
885/// \param __b
886/// A 256-bit vector of [8 x i32] containing one of the source operands.
887/// \returns A 256-bit vector of [8 x i32] containing the sums.
888static __inline__ __m256i __DEFAULT_FN_ATTRS256
889_mm256_hadd_epi32(__m256i __a, __m256i __b)
890{
 /* Both operands are reinterpreted as __v8si for the phaddd256 builtin;
    its result is cast back to the generic __m256i type. */
891 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
892}
893
894/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
895/// vectors of [16 x i16] using signed saturation and returns each sum in
896/// an element of the [16 x i16] result. Sums from \a __a are returned in
897/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
898/// are returned in the upper 64 bits of each 128-bit half of the result.
899///
900/// \code{.operation}
901/// FOR i := 0 TO 1
902/// j := i*128
903/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
904/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
905/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
906/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
907/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
908/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
909/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
910/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
911/// ENDFOR
912/// \endcode
913///
914/// \headerfile <immintrin.h>
915///
916/// This intrinsic corresponds to the \c VPHADDSW instruction.
917///
918/// \param __a
919/// A 256-bit vector of [16 x i16] containing one of the source operands.
920/// \param __b
921/// A 256-bit vector of [16 x i16] containing one of the source operands.
922/// \returns A 256-bit vector of [16 x i16] containing the sums.
923static __inline__ __m256i __DEFAULT_FN_ATTRS256
924_mm256_hadds_epi16(__m256i __a, __m256i __b)
925{
 /* Same operand views (__v16hi) as _mm256_hadd_epi16, but forwarded to the
    phaddsw256 builtin; no saturation logic is written out here. */
926 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
927}
928
929/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
930/// vectors of [16 x i16] and returns the lower 16 bits of each difference
931/// in an element of the [16 x i16] result (overflow is ignored).
932/// Differences from \a __a are returned in the lower 64 bits of each
933/// 128-bit half of the result; differences from \a __b are returned in the
934/// upper 64 bits of each 128-bit half of the result.
935///
936/// \code{.operation}
937/// FOR i := 0 TO 1
938/// j := i*128
939/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
940/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
941/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
942/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
943/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
944/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
945/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
946/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
947/// ENDFOR
948/// \endcode
949///
950/// \headerfile <immintrin.h>
951///
952/// This intrinsic corresponds to the \c VPHSUBW instruction.
953///
954/// \param __a
955/// A 256-bit vector of [16 x i16] containing one of the source operands.
956/// \param __b
957/// A 256-bit vector of [16 x i16] containing one of the source operands.
958/// \returns A 256-bit vector of [16 x i16] containing the differences.
959static __inline__ __m256i __DEFAULT_FN_ATTRS256
960_mm256_hsub_epi16(__m256i __a, __m256i __b)
961{
 /* Both operands are reinterpreted as __v16hi for the phsubw256 builtin;
    its result is cast back to the generic __m256i type. */
962 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
963}
964
965/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
966/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
967/// an element of the [8 x i32] result (overflow is ignored). Differences
968/// from \a __a are returned in the lower 64 bits of each 128-bit half of
969/// the result; differences from \a __b are returned in the upper 64 bits
970/// of each 128-bit half of the result.
971///
972/// \code{.operation}
973/// FOR i := 0 TO 1
974/// j := i*128
975/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
976/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
977/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
978/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
979/// ENDFOR
980/// \endcode
981///
982/// \headerfile <immintrin.h>
983///
984/// This intrinsic corresponds to the \c VPHSUBD instruction.
985///
986/// \param __a
987/// A 256-bit vector of [8 x i32] containing one of the source operands.
988/// \param __b
989/// A 256-bit vector of [8 x i32] containing one of the source operands.
990/// \returns A 256-bit vector of [8 x i32] containing the differences.
991static __inline__ __m256i __DEFAULT_FN_ATTRS256
992_mm256_hsub_epi32(__m256i __a, __m256i __b)
993{
 /* Both operands are reinterpreted as __v8si for the phsubd256 builtin;
    its result is cast back to the generic __m256i type. */
994 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
995}
996
997/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
998/// vectors of [16 x i16] using signed saturation and returns each
999/// difference in an element of the [16 x i16] result. Differences from
1000/// \a __a are returned in the lower 64 bits of each 128-bit half of the
1001/// result; differences from \a __b are returned in the upper 64 bits of
1002/// each 128-bit half of the result.
1003///
1004/// \code{.operation}
1005/// FOR i := 0 TO 1
1006/// j := i*128
1007/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
1008/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1009/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1010/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1011/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1012/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1013/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1014/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1015/// ENDFOR
1016/// \endcode
1017///
1018/// \headerfile <immintrin.h>
1019///
1020/// This intrinsic corresponds to the \c VPHSUBSW instruction.
1021///
1022/// \param __a
1023/// A 256-bit vector of [16 x i16] containing one of the source operands.
1024/// \param __b
1025/// A 256-bit vector of [16 x i16] containing one of the source operands.
1026/// \returns A 256-bit vector of [16 x i16] containing the differences.
1027static __inline__ __m256i __DEFAULT_FN_ATTRS256
1028_mm256_hsubs_epi16(__m256i __a, __m256i __b)
1029{
 /* Same operand views (__v16hi) as _mm256_hsub_epi16, but forwarded to the
    phsubsw256 builtin; no saturation logic is written out here. */
1030 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1031}
1032
1033/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1034/// with the corresponding signed byte from the 256-bit integer vector in
1035/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1036/// pairs of those products using signed saturation to form 16-bit sums
1037/// returned as elements of the [16 x i16] result.
1038///
1039/// \code{.operation}
1040/// FOR i := 0 TO 15
1041/// j := i*16
1042/// temp1 := __a[j+7:j] * __b[j+7:j]
1043/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1044/// result[j+15:j] := SATURATE16(temp1 + temp2)
1045/// ENDFOR
1046/// \endcode
1047///
1048/// \headerfile <immintrin.h>
1049///
1050/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1051///
1052/// \param __a
1053/// A 256-bit vector containing one of the source operands.
1054/// \param __b
1055/// A 256-bit vector containing one of the source operands.
1056/// \returns A 256-bit vector of [16 x i16] containing the result.
1057static __inline__ __m256i __DEFAULT_FN_ATTRS256
1059{
1060 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1061}
1062
1063/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1064/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1065/// those products to form 32-bit sums returned as elements of the
1066/// [8 x i32] result.
1067///
1068/// There is only one wraparound case: when all four of the 16-bit sources
1069/// are \c 0x8000, the result will be \c 0x80000000.
1070///
1071/// \code{.operation}
1072/// FOR i := 0 TO 7
1073/// j := i*32
1074/// temp1 := __a[j+15:j] * __b[j+15:j]
1075/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1076/// result[j+31:j] := temp1 + temp2
1077/// ENDFOR
1078/// \endcode
1079///
1080/// \headerfile <immintrin.h>
1081///
1082/// This intrinsic corresponds to the \c VPMADDWD instruction.
1083///
1084/// \param __a
1085/// A 256-bit vector of [16 x i16] containing one of the source operands.
1086/// \param __b
1087/// A 256-bit vector of [16 x i16] containing one of the source operands.
1088/// \returns A 256-bit vector of [8 x i32] containing the result.
1089static __inline__ __m256i __DEFAULT_FN_ATTRS256
1090_mm256_madd_epi16(__m256i __a, __m256i __b)
1091{
 /* Inputs are viewed as __v16hi for the pmaddwd256 builtin; the result is
    cast to __m256i without any further narrowing or widening here. */
1092 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1093}
1094
1095/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1096/// in \a __a and \a __b and returns the larger of each pair in the
1097/// corresponding byte of the 256-bit result.
1098///
1099/// \headerfile <immintrin.h>
1100///
1101/// This intrinsic corresponds to the \c VPMAXSB instruction.
1102///
1103/// \param __a
1104/// A 256-bit integer vector.
1105/// \param __b
1106/// A 256-bit integer vector.
1107/// \returns A 256-bit integer vector containing the result.
1108static __inline__ __m256i __DEFAULT_FN_ATTRS256
1109_mm256_max_epi8(__m256i __a, __m256i __b)
1110{
 /* The casts choose the element type the generic builtin works on.
    NOTE(review): __v32qs is presumably the explicitly signed byte vector,
    used instead of __v32qi because plain char signedness varies -- confirm
    against the typedefs. */
1111 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1112}
1113
1114/// Compares the corresponding signed 16-bit integers in the two 256-bit
1115/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1116/// each pair in the corresponding element of the 256-bit result.
1117///
1118/// \headerfile <immintrin.h>
1119///
1120/// This intrinsic corresponds to the \c VPMAXSW instruction.
1121///
1122/// \param __a
1123/// A 256-bit vector of [16 x i16].
1124/// \param __b
1125/// A 256-bit vector of [16 x i16].
1126/// \returns A 256-bit vector of [16 x i16] containing the result.
1127static __inline__ __m256i __DEFAULT_FN_ATTRS256
1128_mm256_max_epi16(__m256i __a, __m256i __b)
1129{
 /* The generic element-wise builtin takes its element type from the cast:
    both operands are viewed as __v16hi before the maximum is taken. */
1130 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1131}
1132
1133/// Compares the corresponding signed 32-bit integers in the two 256-bit
1134/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1135/// each pair in the corresponding element of the 256-bit result.
1136///
1137/// \headerfile <immintrin.h>
1138///
1139/// This intrinsic corresponds to the \c VPMAXSD instruction.
1140///
1141/// \param __a
1142/// A 256-bit vector of [8 x i32].
1143/// \param __b
1144/// A 256-bit vector of [8 x i32].
1145/// \returns A 256-bit vector of [8 x i32] containing the result.
1146static __inline__ __m256i __DEFAULT_FN_ATTRS256
1147_mm256_max_epi32(__m256i __a, __m256i __b)
1148{
 /* The generic element-wise builtin takes its element type from the cast:
    both operands are viewed as __v8si before the maximum is taken. */
1149 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1150}
1151
1152/// Compares the corresponding unsigned bytes in the two 256-bit integer
1153/// vectors in \a __a and \a __b and returns the larger of each pair in
1154/// the corresponding byte of the 256-bit result.
1155///
1156/// \headerfile <immintrin.h>
1157///
1158/// This intrinsic corresponds to the \c VPMAXUB instruction.
1159///
1160/// \param __a
1161/// A 256-bit integer vector.
1162/// \param __b
1163/// A 256-bit integer vector.
1164/// \returns A 256-bit integer vector containing the result.
1165static __inline__ __m256i __DEFAULT_FN_ATTRS256
1166_mm256_max_epu8(__m256i __a, __m256i __b)
1167{
 /* Same builtin as the signed variant; only the cast differs.
    NOTE(review): __v32qu is presumably the unsigned-byte vector type, which
    is what would make this an unsigned comparison -- confirm. */
1168 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1169}
1170
1171/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1172/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1173/// each pair in the corresponding element of the 256-bit result.
1174///
1175/// \headerfile <immintrin.h>
1176///
1177/// This intrinsic corresponds to the \c VPMAXUW instruction.
1178///
1179/// \param __a
1180/// A 256-bit vector of [16 x i16].
1181/// \param __b
1182/// A 256-bit vector of [16 x i16].
1183/// \returns A 256-bit vector of [16 x i16] containing the result.
1184static __inline__ __m256i __DEFAULT_FN_ATTRS256
1185_mm256_max_epu16(__m256i __a, __m256i __b)
1186{
 /* Same builtin as the signed variant; only the cast differs.
    NOTE(review): __v16hu is presumably the unsigned 16-bit vector type,
    which is what would make this an unsigned comparison -- confirm. */
1187 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1188}
1189
1190/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1191/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1192/// each pair in the corresponding element of the 256-bit result.
1193///
1194/// \headerfile <immintrin.h>
1195///
1196/// This intrinsic corresponds to the \c VPMAXUD instruction.
1197///
1198/// \param __a
1199/// A 256-bit vector of [8 x i32].
1200/// \param __b
1201/// A 256-bit vector of [8 x i32].
1202/// \returns A 256-bit vector of [8 x i32] containing the result.
1203static __inline__ __m256i __DEFAULT_FN_ATTRS256
1204_mm256_max_epu32(__m256i __a, __m256i __b)
1205{
 /* Same builtin as the signed variant; only the cast differs.
    NOTE(review): __v8su is presumably the unsigned 32-bit vector type,
    which is what would make this an unsigned comparison -- confirm. */
1206 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1207}
1208
1209/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1210/// in \a __a and \a __b and returns the smaller of each pair in the
1211/// corresponding byte of the 256-bit result.
1212///
1213/// \headerfile <immintrin.h>
1214///
1215/// This intrinsic corresponds to the \c VPMINSB instruction.
1216///
1217/// \param __a
1218/// A 256-bit integer vector.
1219/// \param __b
1220/// A 256-bit integer vector.
1221/// \returns A 256-bit integer vector containing the result.
1222static __inline__ __m256i __DEFAULT_FN_ATTRS256
1223_mm256_min_epi8(__m256i __a, __m256i __b)
1224{
 /* The casts choose the element type the generic builtin works on.
    NOTE(review): __v32qs is presumably the explicitly signed byte vector,
    used instead of __v32qi because plain char signedness varies -- confirm
    against the typedefs. */
1225 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1226}
1227
1228/// Compares the corresponding signed 16-bit integers in the two 256-bit
1229/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1230/// each pair in the corresponding element of the 256-bit result.
1231///
1232/// \headerfile <immintrin.h>
1233///
1234/// This intrinsic corresponds to the \c VPMINSW instruction.
1235///
1236/// \param __a
1237/// A 256-bit vector of [16 x i16].
1238/// \param __b
1239/// A 256-bit vector of [16 x i16].
1240/// \returns A 256-bit vector of [16 x i16] containing the result.
1241static __inline__ __m256i __DEFAULT_FN_ATTRS256
1242_mm256_min_epi16(__m256i __a, __m256i __b)
1243{
 /* The generic element-wise builtin takes its element type from the cast:
    both operands are viewed as __v16hi before the minimum is taken. */
1244 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1245}
1246
1247/// Compares the corresponding signed 32-bit integers in the two 256-bit
1248/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1249/// each pair in the corresponding element of the 256-bit result.
1250///
1251/// \headerfile <immintrin.h>
1252///
1253/// This intrinsic corresponds to the \c VPMINSD instruction.
1254///
1255/// \param __a
1256/// A 256-bit vector of [8 x i32].
1257/// \param __b
1258/// A 256-bit vector of [8 x i32].
1259/// \returns A 256-bit vector of [8 x i32] containing the result.
1260static __inline__ __m256i __DEFAULT_FN_ATTRS256
1261_mm256_min_epi32(__m256i __a, __m256i __b)
1262{
 /* The generic element-wise builtin takes its element type from the cast:
    both operands are viewed as __v8si before the minimum is taken. */
1263 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1264}
1265
1266/// Compares the corresponding unsigned bytes in the two 256-bit integer
1267/// vectors in \a __a and \a __b and returns the smaller of each pair in
1268/// the corresponding byte of the 256-bit result.
1269///
1270/// \headerfile <immintrin.h>
1271///
1272/// This intrinsic corresponds to the \c VPMINUB instruction.
1273///
1274/// \param __a
1275/// A 256-bit integer vector.
1276/// \param __b
1277/// A 256-bit integer vector.
1278/// \returns A 256-bit integer vector containing the result.
1279static __inline__ __m256i __DEFAULT_FN_ATTRS256
1280_mm256_min_epu8(__m256i __a, __m256i __b)
1281{
 /* Same builtin as the signed variant; only the cast differs.
    NOTE(review): __v32qu is presumably the unsigned-byte vector type, which
    is what would make this an unsigned comparison -- confirm. */
1282 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1283}
1284
1285/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1286/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1287/// each pair in the corresponding element of the 256-bit result.
1288///
1289/// \headerfile <immintrin.h>
1290///
1291/// This intrinsic corresponds to the \c VPMINUW instruction.
1292///
1293/// \param __a
1294/// A 256-bit vector of [16 x i16].
1295/// \param __b
1296/// A 256-bit vector of [16 x i16].
1297/// \returns A 256-bit vector of [16 x i16] containing the result.
1298static __inline__ __m256i __DEFAULT_FN_ATTRS256
1299_mm256_min_epu16(__m256i __a, __m256i __b)
1300{
 /* Same builtin as the signed variant; only the cast differs.
    NOTE(review): __v16hu is presumably the unsigned 16-bit vector type,
    which is what would make this an unsigned comparison -- confirm. */
1301 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1302}
1303
1304/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1305/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1306/// each pair in the corresponding element of the 256-bit result.
1307///
1308/// \headerfile <immintrin.h>
1309///
1310/// This intrinsic corresponds to the \c VPMINUD instruction.
1311///
1312/// \param __a
1313/// A 256-bit vector of [8 x i32].
1314/// \param __b
1315/// A 256-bit vector of [8 x i32].
1316/// \returns A 256-bit vector of [8 x i32] containing the result.
1317static __inline__ __m256i __DEFAULT_FN_ATTRS256
1318_mm256_min_epu32(__m256i __a, __m256i __b)
1319{
 /* Same builtin as the signed variant; only the cast differs.
    NOTE(review): __v8su is presumably the unsigned 32-bit vector type,
    which is what would make this an unsigned comparison -- confirm. */
1320 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1321}
1322
1323/// Creates a 32-bit integer mask from the most significant bit of each byte
1324/// in the 256-bit integer vector in \a __a and returns the result.
1325///
1326/// \code{.operation}
1327/// FOR i := 0 TO 31
1328/// j := i*8
1329/// result[i] := __a[j+7]
1330/// ENDFOR
1331/// \endcode
1332///
1333/// \headerfile <immintrin.h>
1334///
1335/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1336///
1337/// \param __a
1338/// A 256-bit integer vector containing the source bytes.
1339/// \returns The 32-bit integer mask.
1340static __inline__ int __DEFAULT_FN_ATTRS256
1342{
1343 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1344}
1345
1346/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1347/// the 16-bit values in the corresponding elements of a 256-bit vector
1348/// of [16 x i16].
1349///
1350/// \code{.operation}
1351/// FOR i := 0 TO 15
1352/// j := i*8
1353/// k := i*16
1354/// result[k+15:k] := SignExtend(__V[j+7:j])
1355/// ENDFOR
1356/// \endcode
1357///
1358/// \headerfile <immintrin.h>
1359///
1360/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1361///
1362/// \param __V
1363/// A 128-bit integer vector containing the source bytes.
1364/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1365/// values.
1366static __inline__ __m256i __DEFAULT_FN_ATTRS256
1368{
1369 /* This function always performs a signed extension, but __v16qi is a char
1370 which may be signed or unsigned, so use __v16qs. */
1371 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1372}
1373
1374/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1375/// \a __V and returns the 32-bit values in the corresponding elements of a
1376/// 256-bit vector of [8 x i32].
1377///
1378/// \code{.operation}
1379/// FOR i := 0 TO 7
1380/// j := i*8
1381/// k := i*32
1382/// result[k+31:k] := SignExtend(__V[j+7:j])
1383/// ENDFOR
1384/// \endcode
1385///
1386/// \headerfile <immintrin.h>
1387///
1388/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1389///
1390/// \param __V
1391/// A 128-bit integer vector containing the source bytes.
1392/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1393/// values.
1394static __inline__ __m256i __DEFAULT_FN_ATTRS256
1396{
1397 /* This function always performs a signed extension, but __v16qi is a char
1398 which may be signed or unsigned, so use __v16qs. */
1399 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1400}
1401
1402/// Sign-extends the first four bytes from the 128-bit integer vector in
1403/// \a __V and returns the 64-bit values in the corresponding elements of a
1404/// 256-bit vector of [4 x i64].
1405///
1406/// \code{.operation}
1407/// result[63:0] := SignExtend(__V[7:0])
1408/// result[127:64] := SignExtend(__V[15:8])
1409/// result[191:128] := SignExtend(__V[23:16])
1410/// result[255:192] := SignExtend(__V[31:24])
1411/// \endcode
1412///
1413/// \headerfile <immintrin.h>
1414///
1415/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1416///
1417/// \param __V
1418/// A 128-bit integer vector containing the source bytes.
1419/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1420/// values.
1421static __inline__ __m256i __DEFAULT_FN_ATTRS256
1423{
1424 /* This function always performs a signed extension, but __v16qi is a char
1425 which may be signed or unsigned, so use __v16qs. */
1426 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1427}
1428
1429/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1430/// \a __V and returns the 32-bit values in the corresponding elements of a
1431/// 256-bit vector of [8 x i32].
1432///
1433/// \code{.operation}
1434/// FOR i := 0 TO 7
1435/// j := i*16
1436/// k := i*32
1437/// result[k+31:k] := SignExtend(__V[j+15:j])
1438/// ENDFOR
1439/// \endcode
1440///
1441/// \headerfile <immintrin.h>
1442///
1443/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1444///
1445/// \param __V
1446/// A 128-bit vector of [8 x i16] containing the source values.
1447/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1448/// values.
1449static __inline__ __m256i __DEFAULT_FN_ATTRS256
1451{
1452 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1453}
1454
1455/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1456/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1457/// elements of a 256-bit vector of [4 x i64].
1458///
1459/// \code{.operation}
1460/// result[63:0] := SignExtend(__V[15:0])
1461/// result[127:64] := SignExtend(__V[31:16])
1462/// result[191:128] := SignExtend(__V[47:32])
1463/// result[255:192] := SignExtend(__V[63:48])
1464/// \endcode
1465///
1466/// \headerfile <immintrin.h>
1467///
1468/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1469///
1470/// \param __V
1471/// A 128-bit vector of [8 x i16] containing the source values.
1472/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1473/// values.
1474static __inline__ __m256i __DEFAULT_FN_ATTRS256
1476{
1477 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1478}
1479
1480/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1481/// \a __V and returns the 64-bit values in the corresponding elements of a
1482/// 256-bit vector of [4 x i64].
1483///
1484/// \code{.operation}
1485/// result[63:0] := SignExtend(__V[31:0])
1486/// result[127:64] := SignExtend(__V[63:32])
1487/// result[191:128] := SignExtend(__V[95:64])
1488/// result[255:192] := SignExtend(__V[127:96])
1489/// \endcode
1490///
1491/// \headerfile <immintrin.h>
1492///
1493/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1494///
1495/// \param __V
1496/// A 128-bit vector of [4 x i32] containing the source values.
1497/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1498/// values.
1499static __inline__ __m256i __DEFAULT_FN_ATTRS256
1501{
1502 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1503}
1504
1505/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1506/// the 16-bit values in the corresponding elements of a 256-bit vector
1507/// of [16 x i16].
1508///
1509/// \code{.operation}
1510/// FOR i := 0 TO 15
1511/// j := i*8
1512/// k := i*16
1513/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1514/// ENDFOR
1515/// \endcode
1516///
1517/// \headerfile <immintrin.h>
1518///
1519/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1520///
1521/// \param __V
1522/// A 128-bit integer vector containing the source bytes.
1523/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1524/// values.
1525static __inline__ __m256i __DEFAULT_FN_ATTRS256
1527{
1528 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1529}
1530
1531/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1532/// \a __V and returns the 32-bit values in the corresponding elements of a
1533/// 256-bit vector of [8 x i32].
1534///
1535/// \code{.operation}
1536/// FOR i := 0 TO 7
1537/// j := i*8
1538/// k := i*32
1539/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1540/// ENDFOR
1541/// \endcode
1542///
1543/// \headerfile <immintrin.h>
1544///
1545/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1546///
1547/// \param __V
1548/// A 128-bit integer vector containing the source bytes.
1549/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1550/// values.
1551static __inline__ __m256i __DEFAULT_FN_ATTRS256
1553{
1554 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1555}
1556
1557/// Zero-extends the first four bytes from the 128-bit integer vector in
1558/// \a __V and returns the 64-bit values in the corresponding elements of a
1559/// 256-bit vector of [4 x i64].
1560///
1561/// \code{.operation}
1562/// result[63:0] := ZeroExtend(__V[7:0])
1563/// result[127:64] := ZeroExtend(__V[15:8])
1564/// result[191:128] := ZeroExtend(__V[23:16])
1565/// result[255:192] := ZeroExtend(__V[31:24])
1566/// \endcode
1567///
1568/// \headerfile <immintrin.h>
1569///
1570/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1571///
1572/// \param __V
1573/// A 128-bit integer vector containing the source bytes.
1574/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1575/// values.
1576static __inline__ __m256i __DEFAULT_FN_ATTRS256
1578{
1579 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1580}
1581
1582/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1583/// \a __V and returns the 32-bit values in the corresponding elements of a
1584/// 256-bit vector of [8 x i32].
1585///
1586/// \code{.operation}
1587/// FOR i := 0 TO 7
1588/// j := i*16
1589/// k := i*32
1590/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1591/// ENDFOR
1592/// \endcode
1593///
1594/// \headerfile <immintrin.h>
1595///
1596/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1597///
1598/// \param __V
1599/// A 128-bit vector of [8 x i16] containing the source values.
1600/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1601/// values.
1602static __inline__ __m256i __DEFAULT_FN_ATTRS256
1604{
1605 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1606}
1607
1608/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1609/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1610/// elements of a 256-bit vector of [4 x i64].
1611///
1612/// \code{.operation}
1613/// result[63:0] := ZeroExtend(__V[15:0])
1614/// result[127:64] := ZeroExtend(__V[31:16])
1615/// result[191:128] := ZeroExtend(__V[47:32])
1616/// result[255:192] := ZeroExtend(__V[63:48])
1617/// \endcode
1618///
1619/// \headerfile <immintrin.h>
1620///
1621/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1622///
1623/// \param __V
1624/// A 128-bit vector of [8 x i16] containing the source values.
1625/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1626/// values.
1627static __inline__ __m256i __DEFAULT_FN_ATTRS256
1629{
1630 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1631}
1632
1633/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1634/// \a __V and returns the 64-bit values in the corresponding elements of a
1635/// 256-bit vector of [4 x i64].
1636///
1637/// \code{.operation}
1638/// result[63:0] := ZeroExtend(__V[31:0])
1639/// result[127:64] := ZeroExtend(__V[63:32])
1640/// result[191:128] := ZeroExtend(__V[95:64])
1641/// result[255:192] := ZeroExtend(__V[127:96])
1642/// \endcode
1643///
1644/// \headerfile <immintrin.h>
1645///
1646/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1647///
1648/// \param __V
1649/// A 128-bit vector of [4 x i32] containing the source values.
1650/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1651/// values.
1652static __inline__ __m256i __DEFAULT_FN_ATTRS256
1654{
1655 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1656}
1657
1658/// Multiplies signed 32-bit integers from even-numbered elements of two
1659/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1660/// [4 x i64] result.
1661///
1662/// \code{.operation}
1663/// result[63:0] := __a[31:0] * __b[31:0]
1664/// result[127:64] := __a[95:64] * __b[95:64]
1665/// result[191:128] := __a[159:128] * __b[159:128]
1666/// result[255:192] := __a[223:192] * __b[223:192]
1667/// \endcode
1668///
1669/// \headerfile <immintrin.h>
1670///
1671/// This intrinsic corresponds to the \c VPMULDQ instruction.
1672///
1673/// \param __a
1674/// A 256-bit vector of [8 x i32] containing one of the source operands.
1675/// \param __b
1676/// A 256-bit vector of [8 x i32] containing one of the source operands.
1677/// \returns A 256-bit vector of [4 x i64] containing the products.
1678static __inline__ __m256i __DEFAULT_FN_ATTRS256
1679_mm256_mul_epi32(__m256i __a, __m256i __b)
1680{
 /* The full __v8si vectors are handed to the pmuldq256 builtin; nothing in
    this wrapper selects the even-numbered elements, so that selection is
    left to the builtin. */
1681 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1682}
1683
1684/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1685/// [16 x i16], truncates the 32-bit results to the most significant 18
1686/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1687/// product in the [16 x i16] result.
1688///
1689/// \code{.operation}
1690/// FOR i := 0 TO 15
1691/// j := i*16
1692/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1693/// result[j+15:j] := temp[16:1]
/// ENDFOR
1694/// \endcode
1695///
1696/// \headerfile <immintrin.h>
1697///
1698/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1699///
1700/// \param __a
1701/// A 256-bit vector of [16 x i16] containing one of the source operands.
1702/// \param __b
1703/// A 256-bit vector of [16 x i16] containing one of the source operands.
1704/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1705static __inline__ __m256i __DEFAULT_FN_ATTRS256
1706_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1707{
 /* Both operands are viewed as __v16hi for the pmulhrsw256 builtin; the
    shift-and-round steps are not written out in this wrapper. */
1708 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1709}
1710
1711/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1712/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1713/// [16 x i16] result.
1714///
1715/// \headerfile <immintrin.h>
1716///
1717/// This intrinsic corresponds to the \c VPMULHUW instruction.
1718///
1719/// \param __a
1720/// A 256-bit vector of [16 x i16] containing one of the source operands.
1721/// \param __b
1722/// A 256-bit vector of [16 x i16] containing one of the source operands.
1723/// \returns A 256-bit vector of [16 x i16] containing the products.
1724static __inline__ __m256i __DEFAULT_FN_ATTRS256
1725_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1726{
1727 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1728}
1729
1730/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1731/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1732/// [16 x i16] result.
1733///
1734/// \headerfile <immintrin.h>
1735///
1736/// This intrinsic corresponds to the \c VPMULHW instruction.
1737///
1738/// \param __a
1739/// A 256-bit vector of [16 x i16] containing one of the source operands.
1740/// \param __b
1741/// A 256-bit vector of [16 x i16] containing one of the source operands.
1742/// \returns A 256-bit vector of [16 x i16] containing the products.
1743static __inline__ __m256i __DEFAULT_FN_ATTRS256
1744_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1745{
1746 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1747}
1748
1749/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1750/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1751/// [16 x i16] result.
1752///
1753/// \headerfile <immintrin.h>
1754///
1755/// This intrinsic corresponds to the \c VPMULLW instruction.
1756///
1757/// \param __a
1758/// A 256-bit vector of [16 x i16] containing one of the source operands.
1759/// \param __b
1760/// A 256-bit vector of [16 x i16] containing one of the source operands.
1761/// \returns A 256-bit vector of [16 x i16] containing the products.
1762static __inline__ __m256i __DEFAULT_FN_ATTRS256
1763_mm256_mullo_epi16(__m256i __a, __m256i __b)
1764{
1765 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1766}
1767
1768/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1769/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1770/// [8 x i32] result.
1771///
1772/// \headerfile <immintrin.h>
1773///
1774/// This intrinsic corresponds to the \c VPMULLD instruction.
1775///
1776/// \param __a
1777/// A 256-bit vector of [8 x i32] containing one of the source operands.
1778/// \param __b
1779/// A 256-bit vector of [8 x i32] containing one of the source operands.
1780/// \returns A 256-bit vector of [8 x i32] containing the products.
1781static __inline__ __m256i __DEFAULT_FN_ATTRS256
1782_mm256_mullo_epi32 (__m256i __a, __m256i __b)
1783{
1784 return (__m256i)((__v8su)__a * (__v8su)__b);
1785}
1786
1787/// Multiplies unsigned 32-bit integers from even-numered elements of two
1788/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1789/// [4 x i64] result.
1790///
1791/// \code{.operation}
1792/// result[63:0] := __a[31:0] * __b[31:0]
1793/// result[127:64] := __a[95:64] * __b[95:64]
1794/// result[191:128] := __a[159:128] * __b[159:128]
1795/// result[255:192] := __a[223:192] * __b[223:192]
1796/// \endcode
1797///
1798/// \headerfile <immintrin.h>
1799///
1800/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1801///
1802/// \param __a
1803/// A 256-bit vector of [8 x i32] containing one of the source operands.
1804/// \param __b
1805/// A 256-bit vector of [8 x i32] containing one of the source operands.
1806/// \returns A 256-bit vector of [4 x i64] containing the products.
1807static __inline__ __m256i __DEFAULT_FN_ATTRS256
1808_mm256_mul_epu32(__m256i __a, __m256i __b)
1809{
1810 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1811}
1812
1813/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1814/// \a __b.
1815///
1816/// \headerfile <immintrin.h>
1817///
1818/// This intrinsic corresponds to the \c VPOR instruction.
1819///
1820/// \param __a
1821/// A 256-bit integer vector.
1822/// \param __b
1823/// A 256-bit integer vector.
1824/// \returns A 256-bit integer vector containing the result.
1825static __inline__ __m256i __DEFAULT_FN_ATTRS256
1826_mm256_or_si256(__m256i __a, __m256i __b)
1827{
1828 return (__m256i)((__v4du)__a | (__v4du)__b);
1829}
1830
1831/// Computes four sum of absolute difference (SAD) operations on sets of eight
1832/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1833/// \a __b.
1834///
1835/// One SAD result is computed for each set of eight bytes from \a __a and
1836/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1837/// corresponding 64-bit element of the result.
1838///
1839/// A single SAD operation takes the differences between the corresponding
1840/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1841/// and sums these eight values to form one 16-bit result. This operation
1842/// is repeated four times with successive sets of eight bytes.
1843///
1844/// \code{.operation}
1845/// FOR i := 0 TO 3
1846/// j := i*64
1847/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1848/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1849/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1850/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1851/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1852/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1853/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1854/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1855/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1856/// temp4 + temp5 + temp6 + temp7
1857/// result[j+63:j+16] := 0
1858/// ENDFOR
1859/// \endcode
1860///
1861/// \headerfile <immintrin.h>
1862///
1863/// This intrinsic corresponds to the \c VPSADBW instruction.
1864///
1865/// \param __a
1866/// A 256-bit integer vector.
1867/// \param __b
1868/// A 256-bit integer vector.
1869/// \returns A 256-bit integer vector containing the result.
1870static __inline__ __m256i __DEFAULT_FN_ATTRS256
1871_mm256_sad_epu8(__m256i __a, __m256i __b)
1872{
1873 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1874}
1875
1876/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1877/// to control information in the 256-bit integer vector \a __b, and
1878/// returns the 256-bit result. In effect there are two separate 128-bit
1879/// shuffles in the lower and upper halves.
1880///
1881/// \code{.operation}
1882/// FOR i := 0 TO 31
1883/// j := i*8
1884/// IF __b[j+7] == 1
1885/// result[j+7:j] := 0
1886/// ELSE
1887/// k := __b[j+3:j] * 8
1888/// IF i > 15
1889/// k := k + 128
1890/// FI
1891/// result[j+7:j] := __a[k+7:k]
1892/// FI
1893/// ENDFOR
1894/// \endcode
1895///
1896/// \headerfile <immintrin.h>
1897///
1898/// This intrinsic corresponds to the \c VPSHUFB instruction.
1899///
1900/// \param __a
1901/// A 256-bit integer vector containing source values.
1902/// \param __b
1903/// A 256-bit integer vector containing control information to determine
1904/// what goes into the corresponding byte of the result. If bit 7 of the
1905/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1906/// control byte specify the index (within the same 128-bit half) of \a __a
1907/// to copy to the result byte.
1908/// \returns A 256-bit integer vector containing the result.
1909static __inline__ __m256i __DEFAULT_FN_ATTRS256
1910_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1911{
1912 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1913}
1914
/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. In effect there are two parallel 128-bit
///    shuffles in the lower and upper halves.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1947
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are relative to the upper 64 bits of the same 128-bit
///    half (so 0 selects element 4 of that half, and so forth).
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1983
/// Rearranges 16-bit integers of the 256-bit vector of [16 x i16] \a a as
///    directed by the integer literal \a imm and returns the 256-bit
///    [16 x i16] result. Only the lower 64 bits of each 128-bit half are
///    shuffled; the upper 64 bits of each 128-bit half pass through from
///    \a a unchanged.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] supplying the data for the result.
/// \param imm
///    An immediate 8-bit value selecting the elements to copy from \a a.
///    \a imm[1:0] gives the index in \a a for elements 0 and 8 of the
///    result, \a imm[3:2] gives the index for elements 1 and 9, and so
///    forth.
/// \returns A 256-bit vector of [16 x i16] containing the shuffled elements.
#define _mm256_shufflelo_epi16(a, imm)                                        \
  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
2020
2021/// Sets each byte of the result to the corresponding byte of the 256-bit
2022/// integer vector in \a __a, the negative of that byte, or zero, depending
2023/// on whether the corresponding byte of the 256-bit integer vector in
2024/// \a __b is greater than zero, less than zero, or equal to zero,
2025/// respectively.
2026///
2027/// \headerfile <immintrin.h>
2028///
2029/// This intrinsic corresponds to the \c VPSIGNB instruction.
2030///
2031/// \param __a
2032/// A 256-bit integer vector.
2033/// \param __b
2034/// A 256-bit integer vector].
2035/// \returns A 256-bit integer vector containing the result.
2036static __inline__ __m256i __DEFAULT_FN_ATTRS256
2037_mm256_sign_epi8(__m256i __a, __m256i __b)
2038{
2039 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2040}
2041
2042/// Sets each element of the result to the corresponding element of the
2043/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2044/// or zero, depending on whether the corresponding element of the 256-bit
2045/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2046/// equal to zero, respectively.
2047///
2048/// \headerfile <immintrin.h>
2049///
2050/// This intrinsic corresponds to the \c VPSIGNW instruction.
2051///
2052/// \param __a
2053/// A 256-bit vector of [16 x i16].
2054/// \param __b
2055/// A 256-bit vector of [16 x i16].
2056/// \returns A 256-bit vector of [16 x i16] containing the result.
2057static __inline__ __m256i __DEFAULT_FN_ATTRS256
2058_mm256_sign_epi16(__m256i __a, __m256i __b)
2059{
2060 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2061}
2062
2063/// Sets each element of the result to the corresponding element of the
2064/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2065/// zero, depending on whether the corresponding element of the 256-bit
2066/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2067/// equal to zero, respectively.
2068///
2069/// \headerfile <immintrin.h>
2070///
2071/// This intrinsic corresponds to the \c VPSIGND instruction.
2072///
2073/// \param __a
2074/// A 256-bit vector of [8 x i32].
2075/// \param __b
2076/// A 256-bit vector of [8 x i32].
2077/// \returns A 256-bit vector of [8 x i32] containing the result.
2078static __inline__ __m256i __DEFAULT_FN_ATTRS256
2079_mm256_sign_epi32(__m256i __a, __m256i __b)
2080{
2081 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2082}
2083
/// Performs a byte-wise left shift of each 128-bit half of the 256-bit
///    integer vector \a a by \a imm bytes, filling the vacated positions with
///    zero bytes. When \a imm is greater than 15 the result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_slli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    The 256-bit integer vector to shift.
/// \param imm
///    An unsigned immediate value giving the shift count in bytes.
/// \returns A 256-bit integer vector containing the shifted halves.
#define _mm256_slli_si256(a, imm)                                             \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),         \
                                                (int)(imm)))
2103
/// Performs a byte-wise left shift of each 128-bit half of the 256-bit
///    integer vector \a a by \a imm bytes, filling the vacated positions with
///    zero bytes. When \a imm is greater than 15 the result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    The 256-bit integer vector to shift.
/// \param imm
///    An unsigned immediate value giving the shift count in bytes.
/// \returns A 256-bit integer vector containing the shifted halves.
#define _mm256_bslli_epi128(a, imm)                                           \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),         \
                                                (int)(imm)))
2123
2124/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2125/// left by \a __count bits, shifting in zero bits, and returns the result.
2126/// If \a __count is greater than 15, the returned result is all zeroes.
2127///
2128/// \headerfile <immintrin.h>
2129///
2130/// This intrinsic corresponds to the \c VPSLLW instruction.
2131///
2132/// \param __a
2133/// A 256-bit vector of [16 x i16] to be shifted.
2134/// \param __count
2135/// An unsigned integer value specifying the shift count (in bits).
2136/// \returns A 256-bit vector of [16 x i16] containing the result.
2137static __inline__ __m256i __DEFAULT_FN_ATTRS256
2138_mm256_slli_epi16(__m256i __a, int __count)
2139{
2140 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2141}
2142
2143/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2144/// left by the number of bits specified by the lower 64 bits of \a __count,
2145/// shifting in zero bits, and returns the result. If \a __count is greater
2146/// than 15, the returned result is all zeroes.
2147///
2148/// \headerfile <immintrin.h>
2149///
2150/// This intrinsic corresponds to the \c VPSLLW instruction.
2151///
2152/// \param __a
2153/// A 256-bit vector of [16 x i16] to be shifted.
2154/// \param __count
2155/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2156/// shift count (in bits). The upper element is ignored.
2157/// \returns A 256-bit vector of [16 x i16] containing the result.
2158static __inline__ __m256i __DEFAULT_FN_ATTRS256
2159_mm256_sll_epi16(__m256i __a, __m128i __count)
2160{
2161 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2162}
2163
2164/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2165/// left by \a __count bits, shifting in zero bits, and returns the result.
2166/// If \a __count is greater than 31, the returned result is all zeroes.
2167///
2168/// \headerfile <immintrin.h>
2169///
2170/// This intrinsic corresponds to the \c VPSLLD instruction.
2171///
2172/// \param __a
2173/// A 256-bit vector of [8 x i32] to be shifted.
2174/// \param __count
2175/// An unsigned integer value specifying the shift count (in bits).
2176/// \returns A 256-bit vector of [8 x i32] containing the result.
2177static __inline__ __m256i __DEFAULT_FN_ATTRS256
2178_mm256_slli_epi32(__m256i __a, int __count)
2179{
2180 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2181}
2182
2183/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2184/// left by the number of bits given in the lower 64 bits of \a __count,
2185/// shifting in zero bits, and returns the result. If \a __count is greater
2186/// than 31, the returned result is all zeroes.
2187///
2188/// \headerfile <immintrin.h>
2189///
2190/// This intrinsic corresponds to the \c VPSLLD instruction.
2191///
2192/// \param __a
2193/// A 256-bit vector of [8 x i32] to be shifted.
2194/// \param __count
2195/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2196/// shift count (in bits). The upper element is ignored.
2197/// \returns A 256-bit vector of [8 x i32] containing the result.
2198static __inline__ __m256i __DEFAULT_FN_ATTRS256
2199_mm256_sll_epi32(__m256i __a, __m128i __count)
2200{
2201 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2202}
2203
2204/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2205/// left by \a __count bits, shifting in zero bits, and returns the result.
2206/// If \a __count is greater than 63, the returned result is all zeroes.
2207///
2208/// \headerfile <immintrin.h>
2209///
2210/// This intrinsic corresponds to the \c VPSLLQ instruction.
2211///
2212/// \param __a
2213/// A 256-bit vector of [4 x i64] to be shifted.
2214/// \param __count
2215/// An unsigned integer value specifying the shift count (in bits).
2216/// \returns A 256-bit vector of [4 x i64] containing the result.
2217static __inline__ __m256i __DEFAULT_FN_ATTRS256
2218_mm256_slli_epi64(__m256i __a, int __count)
2219{
2220 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2221}
2222
2223/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2224/// left by the number of bits given in the lower 64 bits of \a __count,
2225/// shifting in zero bits, and returns the result. If \a __count is greater
2226/// than 63, the returned result is all zeroes.
2227///
2228/// \headerfile <immintrin.h>
2229///
2230/// This intrinsic corresponds to the \c VPSLLQ instruction.
2231///
2232/// \param __a
2233/// A 256-bit vector of [4 x i64] to be shifted.
2234/// \param __count
2235/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2236/// shift count (in bits). The upper element is ignored.
2237/// \returns A 256-bit vector of [4 x i64] containing the result.
2238static __inline__ __m256i __DEFAULT_FN_ATTRS256
2239_mm256_sll_epi64(__m256i __a, __m128i __count)
2240{
2241 return __builtin_ia32_psllq256((__v4di)__a, __count);
2242}
2243
2244/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2245/// right by \a __count bits, shifting in sign bits, and returns the result.
2246/// If \a __count is greater than 15, each element of the result is either
2247/// 0 or -1 according to the corresponding input sign bit.
2248///
2249/// \headerfile <immintrin.h>
2250///
2251/// This intrinsic corresponds to the \c VPSRAW instruction.
2252///
2253/// \param __a
2254/// A 256-bit vector of [16 x i16] to be shifted.
2255/// \param __count
2256/// An unsigned integer value specifying the shift count (in bits).
2257/// \returns A 256-bit vector of [16 x i16] containing the result.
2258static __inline__ __m256i __DEFAULT_FN_ATTRS256
2259_mm256_srai_epi16(__m256i __a, int __count)
2260{
2261 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2262}
2263
2264/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2265/// right by the number of bits given in the lower 64 bits of \a __count,
2266/// shifting in sign bits, and returns the result. If \a __count is greater
2267/// than 15, each element of the result is either 0 or -1 according to the
2268/// corresponding input sign bit.
2269///
2270/// \headerfile <immintrin.h>
2271///
2272/// This intrinsic corresponds to the \c VPSRAW instruction.
2273///
2274/// \param __a
2275/// A 256-bit vector of [16 x i16] to be shifted.
2276/// \param __count
2277/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2278/// shift count (in bits). The upper element is ignored.
2279/// \returns A 256-bit vector of [16 x i16] containing the result.
2280static __inline__ __m256i __DEFAULT_FN_ATTRS256
2281_mm256_sra_epi16(__m256i __a, __m128i __count)
2282{
2283 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2284}
2285
2286/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2287/// right by \a __count bits, shifting in sign bits, and returns the result.
2288/// If \a __count is greater than 31, each element of the result is either
2289/// 0 or -1 according to the corresponding input sign bit.
2290///
2291/// \headerfile <immintrin.h>
2292///
2293/// This intrinsic corresponds to the \c VPSRAD instruction.
2294///
2295/// \param __a
2296/// A 256-bit vector of [8 x i32] to be shifted.
2297/// \param __count
2298/// An unsigned integer value specifying the shift count (in bits).
2299/// \returns A 256-bit vector of [8 x i32] containing the result.
2300static __inline__ __m256i __DEFAULT_FN_ATTRS256
2301_mm256_srai_epi32(__m256i __a, int __count)
2302{
2303 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2304}
2305
2306/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2307/// right by the number of bits given in the lower 64 bits of \a __count,
2308/// shifting in sign bits, and returns the result. If \a __count is greater
2309/// than 31, each element of the result is either 0 or -1 according to the
2310/// corresponding input sign bit.
2311///
2312/// \headerfile <immintrin.h>
2313///
2314/// This intrinsic corresponds to the \c VPSRAD instruction.
2315///
2316/// \param __a
2317/// A 256-bit vector of [8 x i32] to be shifted.
2318/// \param __count
2319/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2320/// shift count (in bits). The upper element is ignored.
2321/// \returns A 256-bit vector of [8 x i32] containing the result.
2322static __inline__ __m256i __DEFAULT_FN_ATTRS256
2323_mm256_sra_epi32(__m256i __a, __m128i __count)
2324{
2325 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2326}
2327
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_srli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm)                                             \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),         \
                                                (int)(imm)))
2347
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm)                                           \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),         \
                                                (int)(imm)))
2367
2368/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2369/// right by \a __count bits, shifting in zero bits, and returns the result.
2370/// If \a __count is greater than 15, the returned result is all zeroes.
2371///
2372/// \headerfile <immintrin.h>
2373///
2374/// This intrinsic corresponds to the \c VPSRLW instruction.
2375///
2376/// \param __a
2377/// A 256-bit vector of [16 x i16] to be shifted.
2378/// \param __count
2379/// An unsigned integer value specifying the shift count (in bits).
2380/// \returns A 256-bit vector of [16 x i16] containing the result.
2381static __inline__ __m256i __DEFAULT_FN_ATTRS256
2382_mm256_srli_epi16(__m256i __a, int __count)
2383{
2384 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2385}
2386
2387/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2388/// right by the number of bits given in the lower 64 bits of \a __count,
2389/// shifting in zero bits, and returns the result. If \a __count is greater
2390/// than 15, the returned result is all zeroes.
2391///
2392/// \headerfile <immintrin.h>
2393///
2394/// This intrinsic corresponds to the \c VPSRLW instruction.
2395///
2396/// \param __a
2397/// A 256-bit vector of [16 x i16] to be shifted.
2398/// \param __count
2399/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2400/// shift count (in bits). The upper element is ignored.
2401/// \returns A 256-bit vector of [16 x i16] containing the result.
2402static __inline__ __m256i __DEFAULT_FN_ATTRS256
2403_mm256_srl_epi16(__m256i __a, __m128i __count)
2404{
2405 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2406}
2407
2408/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2409/// right by \a __count bits, shifting in zero bits, and returns the result.
2410/// If \a __count is greater than 31, the returned result is all zeroes.
2411///
2412/// \headerfile <immintrin.h>
2413///
2414/// This intrinsic corresponds to the \c VPSRLD instruction.
2415///
2416/// \param __a
2417/// A 256-bit vector of [8 x i32] to be shifted.
2418/// \param __count
2419/// An unsigned integer value specifying the shift count (in bits).
2420/// \returns A 256-bit vector of [8 x i32] containing the result.
2421static __inline__ __m256i __DEFAULT_FN_ATTRS256
2422_mm256_srli_epi32(__m256i __a, int __count)
2423{
2424 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2425}
2426
2427/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2428/// right by the number of bits given in the lower 64 bits of \a __count,
2429/// shifting in zero bits, and returns the result. If \a __count is greater
2430/// than 31, the returned result is all zeroes.
2431///
2432/// \headerfile <immintrin.h>
2433///
2434/// This intrinsic corresponds to the \c VPSRLD instruction.
2435///
2436/// \param __a
2437/// A 256-bit vector of [8 x i32] to be shifted.
2438/// \param __count
2439/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2440/// shift count (in bits). The upper element is ignored.
2441/// \returns A 256-bit vector of [8 x i32] containing the result.
2442static __inline__ __m256i __DEFAULT_FN_ATTRS256
2443_mm256_srl_epi32(__m256i __a, __m128i __count)
2444{
2445 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2446}
2447
2448/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2449/// right by \a __count bits, shifting in zero bits, and returns the result.
2450/// If \a __count is greater than 63, the returned result is all zeroes.
2451///
2452/// \headerfile <immintrin.h>
2453///
2454/// This intrinsic corresponds to the \c VPSRLQ instruction.
2455///
2456/// \param __a
2457/// A 256-bit vector of [4 x i64] to be shifted.
2458/// \param __count
2459/// An unsigned integer value specifying the shift count (in bits).
2460/// \returns A 256-bit vector of [4 x i64] containing the result.
2461static __inline__ __m256i __DEFAULT_FN_ATTRS256
2462_mm256_srli_epi64(__m256i __a, int __count)
2463{
2464 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2465}
2466
2467/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2468/// right by the number of bits given in the lower 64 bits of \a __count,
2469/// shifting in zero bits, and returns the result. If \a __count is greater
2470/// than 63, the returned result is all zeroes.
2471///
2472/// \headerfile <immintrin.h>
2473///
2474/// This intrinsic corresponds to the \c VPSRLQ instruction.
2475///
2476/// \param __a
2477/// A 256-bit vector of [4 x i64] to be shifted.
2478/// \param __count
2479/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2480/// shift count (in bits). The upper element is ignored.
2481/// \returns A 256-bit vector of [4 x i64] containing the result.
2482static __inline__ __m256i __DEFAULT_FN_ATTRS256
2483_mm256_srl_epi64(__m256i __a, __m128i __count)
2484{
2485 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2486}
2487
2488/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2489/// vectors. Returns the lower 8 bits of each difference in the
2490/// corresponding byte of the 256-bit integer vector result (overflow is
2491/// ignored).
2492///
2493/// \code{.operation}
2494/// FOR i := 0 TO 31
2495/// j := i*8
2496/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2497/// ENDFOR
2498/// \endcode
2499///
2500/// \headerfile <immintrin.h>
2501///
2502/// This intrinsic corresponds to the \c VPSUBB instruction.
2503///
2504/// \param __a
2505/// A 256-bit integer vector containing the minuends.
2506/// \param __b
2507/// A 256-bit integer vector containing the subtrahends.
2508/// \returns A 256-bit integer vector containing the differences.
2509static __inline__ __m256i __DEFAULT_FN_ATTRS256
2510_mm256_sub_epi8(__m256i __a, __m256i __b)
2511{
2512 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2513}
2514
2515/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2516/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2517/// the corresponding element of the [16 x i16] result (overflow is
2518/// ignored).
2519///
2520/// \code{.operation}
2521/// FOR i := 0 TO 15
2522/// j := i*16
2523/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2524/// ENDFOR
2525/// \endcode
2526///
2527/// \headerfile <immintrin.h>
2528///
2529/// This intrinsic corresponds to the \c VPSUBW instruction.
2530///
2531/// \param __a
2532/// A 256-bit vector of [16 x i16] containing the minuends.
2533/// \param __b
2534/// A 256-bit vector of [16 x i16] containing the subtrahends.
2535/// \returns A 256-bit vector of [16 x i16] containing the differences.
2536static __inline__ __m256i __DEFAULT_FN_ATTRS256
2537_mm256_sub_epi16(__m256i __a, __m256i __b)
2538{
2539 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2540}
2541
2542/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2543/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2544/// the corresponding element of the [8 x i32] result (overflow is ignored).
2545///
2546/// \code{.operation}
2547/// FOR i := 0 TO 7
2548/// j := i*32
2549/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2550/// ENDFOR
2551/// \endcode
2552///
2553/// \headerfile <immintrin.h>
2554///
2555/// This intrinsic corresponds to the \c VPSUBD instruction.
2556///
2557/// \param __a
2558/// A 256-bit vector of [8 x i32] containing the minuends.
2559/// \param __b
2560/// A 256-bit vector of [8 x i32] containing the subtrahends.
2561/// \returns A 256-bit vector of [8 x i32] containing the differences.
2562static __inline__ __m256i __DEFAULT_FN_ATTRS256
2563_mm256_sub_epi32(__m256i __a, __m256i __b)
2564{
2565 return (__m256i)((__v8su)__a - (__v8su)__b);
2566}
2567
2568/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2569/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2570/// the corresponding element of the [4 x i64] result (overflow is ignored).
2571///
2572/// \code{.operation}
2573/// FOR i := 0 TO 3
2574/// j := i*64
2575/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2576/// ENDFOR
2577/// \endcode
2578///
2579/// \headerfile <immintrin.h>
2580///
2581/// This intrinsic corresponds to the \c VPSUBQ instruction.
2582///
2583/// \param __a
2584/// A 256-bit vector of [4 x i64] containing the minuends.
2585/// \param __b
2586/// A 256-bit vector of [4 x i64] containing the subtrahends.
2587/// \returns A 256-bit vector of [4 x i64] containing the differences.
2588static __inline__ __m256i __DEFAULT_FN_ATTRS256
2589_mm256_sub_epi64(__m256i __a, __m256i __b)
2590{
2591 return (__m256i)((__v4du)__a - (__v4du)__b);
2592}
2593
2594/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2595/// vectors using signed saturation, and returns each differences in the
2596/// corresponding byte of the 256-bit integer vector result.
2597///
2598/// \code{.operation}
2599/// FOR i := 0 TO 31
2600/// j := i*8
2601/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2602/// ENDFOR
2603/// \endcode
2604///
2605/// \headerfile <immintrin.h>
2606///
2607/// This intrinsic corresponds to the \c VPSUBSB instruction.
2608///
2609/// \param __a
2610/// A 256-bit integer vector containing the minuends.
2611/// \param __b
2612/// A 256-bit integer vector containing the subtrahends.
2613/// \returns A 256-bit integer vector containing the differences.
2614static __inline__ __m256i __DEFAULT_FN_ATTRS256
2615_mm256_subs_epi8(__m256i __a, __m256i __b)
2616{
2617 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2618}
2619
2620/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2621/// vectors of [16 x i16] using signed saturation, and returns each
2622/// difference in the corresponding element of the [16 x i16] result.
2623///
2624/// \code{.operation}
2625/// FOR i := 0 TO 15
2626/// j := i*16
2627/// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
2628/// ENDFOR
2629/// \endcode
2630///
2631/// \headerfile <immintrin.h>
2632///
2633/// This intrinsic corresponds to the \c VPSUBSW instruction.
2634///
2635/// \param __a
2636/// A 256-bit vector of [16 x i16] containing the minuends.
2637/// \param __b
2638/// A 256-bit vector of [16 x i16] containing the subtrahends.
2639/// \returns A 256-bit vector of [16 x i16] containing the differences.
2640static __inline__ __m256i __DEFAULT_FN_ATTRS256
2641_mm256_subs_epi16(__m256i __a, __m256i __b)
2642{
2643 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2644}
2645
2646/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2647/// vectors using unsigned saturation, and returns each difference in the
2648/// corresponding byte of the 256-bit integer vector result. For each byte,
2649/// computes <c> result = __a - __b </c>.
2650///
2651/// \code{.operation}
2652/// FOR i := 0 TO 31
2653/// j := i*8
2654/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2655/// ENDFOR
2656/// \endcode
2657///
2658/// \headerfile <immintrin.h>
2659///
2660/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2661///
2662/// \param __a
2663/// A 256-bit integer vector containing the minuends.
2664/// \param __b
2665/// A 256-bit integer vector containing the subtrahends.
2666/// \returns A 256-bit integer vector containing the differences.
2667static __inline__ __m256i __DEFAULT_FN_ATTRS256
2668_mm256_subs_epu8(__m256i __a, __m256i __b)
2669{
2670 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2671}
2672
2673/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2674/// vectors of [16 x i16] using unsigned saturation, and returns each
2675/// difference in the corresponding element of the [16 x i16] result.
2676///
2677/// \code{.operation}
2678/// FOR i := 0 TO 15
2679/// j := i*16
2680/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2681/// ENDFOR
2682/// \endcode
2683///
2684/// \headerfile <immintrin.h>
2685///
2686/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2687///
2688/// \param __a
2689/// A 256-bit vector of [16 x i16] containing the minuends.
2690/// \param __b
2691/// A 256-bit vector of [16 x i16] containing the subtrahends.
2692/// \returns A 256-bit vector of [16 x i16] containing the differences.
2693static __inline__ __m256i __DEFAULT_FN_ATTRS256
2694_mm256_subs_epu16(__m256i __a, __m256i __b)
2695{
2696 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2697}
2698
2699/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2700/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2701/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2702/// input; other bits in these parameters are ignored.
2703///
2704/// \code{.operation}
2705/// result[7:0] := __a[71:64]
2706/// result[15:8] := __b[71:64]
2707/// result[23:16] := __a[79:72]
2708/// result[31:24] := __b[79:72]
2709/// . . .
2710/// result[127:120] := __b[127:120]
2711/// result[135:128] := __a[199:192]
2712/// . . .
2713/// result[255:248] := __b[255:248]
2714/// \endcode
2715///
2716/// \headerfile <immintrin.h>
2717///
2718/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2719///
2720/// \param __a
2721/// A 256-bit integer vector used as the source for the even-numbered bytes
2722/// of the result.
2723/// \param __b
2724/// A 256-bit integer vector used as the source for the odd-numbered bytes
2725/// of the result.
2726/// \returns A 256-bit integer vector containing the result.
2727static __inline__ __m256i __DEFAULT_FN_ATTRS256
2729{
2730 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2731}
2732
2733/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2734/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2735/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2736/// 128-bit half of \a __a and \a __b as input; other bits in these
2737/// parameters are ignored.
2738///
2739/// \code{.operation}
2740/// result[15:0] := __a[79:64]
2741/// result[31:16] := __b[79:64]
2742/// result[47:32] := __a[95:80]
2743/// result[63:48] := __b[95:80]
2744/// . . .
2745/// result[127:112] := __b[127:112]
2746/// result[143:128] := __a[211:196]
2747/// . . .
2748/// result[255:240] := __b[255:240]
2749/// \endcode
2750///
2751/// \headerfile <immintrin.h>
2752///
2753/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2754///
2755/// \param __a
2756/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2757/// elements of the result.
2758/// \param __b
2759/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2760/// elements of the result.
2761/// \returns A 256-bit vector of [16 x i16] containing the result.
2762static __inline__ __m256i __DEFAULT_FN_ATTRS256
2764{
2765 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2766}
2767
2768/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2769/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2770/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2771/// of \a __a and \a __b as input; other bits in these parameters are
2772/// ignored.
2773///
2774/// \code{.operation}
2775/// result[31:0] := __a[95:64]
2776/// result[63:32] := __b[95:64]
2777/// result[95:64] := __a[127:96]
2778/// result[127:96] := __b[127:96]
2779/// result[159:128] := __a[223:192]
2780/// result[191:160] := __b[223:192]
2781/// result[223:192] := __a[255:224]
2782/// result[255:224] := __b[255:224]
2783/// \endcode
2784///
2785/// \headerfile <immintrin.h>
2786///
2787/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2788///
2789/// \param __a
2790/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2791/// elements of the result.
2792/// \param __b
2793/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2794/// elements of the result.
2795/// \returns A 256-bit vector of [8 x i32] containing the result.
2796static __inline__ __m256i __DEFAULT_FN_ATTRS256
2798{
2799 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2800}
2801
2802/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2803/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2804/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2805/// of \a __a and \a __b as input; other bits in these parameters are
2806/// ignored.
2807///
2808/// \code{.operation}
2809/// result[63:0] := __a[127:64]
2810/// result[127:64] := __b[127:64]
2811/// result[191:128] := __a[255:192]
2812/// result[255:192] := __b[255:192]
2813/// \endcode
2814///
2815/// \headerfile <immintrin.h>
2816///
2817/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2818///
2819/// \param __a
2820/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2821/// elements of the result.
2822/// \param __b
2823/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2824/// elements of the result.
2825/// \returns A 256-bit vector of [4 x i64] containing the result.
2826static __inline__ __m256i __DEFAULT_FN_ATTRS256
2828{
2829 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2830}
2831
2832/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2833/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2834/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2835/// input; other bits in these parameters are ignored.
2836///
2837/// \code{.operation}
2838/// result[7:0] := __a[7:0]
2839/// result[15:8] := __b[7:0]
2840/// result[23:16] := __a[15:8]
2841/// result[31:24] := __b[15:8]
2842/// . . .
2843/// result[127:120] := __b[63:56]
2844/// result[135:128] := __a[135:128]
2845/// . . .
2846/// result[255:248] := __b[191:184]
2847/// \endcode
2848///
2849/// \headerfile <immintrin.h>
2850///
2851/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2852///
2853/// \param __a
2854/// A 256-bit integer vector used as the source for the even-numbered bytes
2855/// of the result.
2856/// \param __b
2857/// A 256-bit integer vector used as the source for the odd-numbered bytes
2858/// of the result.
2859/// \returns A 256-bit integer vector containing the result.
2860static __inline__ __m256i __DEFAULT_FN_ATTRS256
2862{
2863 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2864}
2865
2866/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2867/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2868/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2869/// 128-bit half of \a __a and \a __b as input; other bits in these
2870/// parameters are ignored.
2871///
2872/// \code{.operation}
2873/// result[15:0] := __a[15:0]
2874/// result[31:16] := __b[15:0]
2875/// result[47:32] := __a[31:16]
2876/// result[63:48] := __b[31:16]
2877/// . . .
2878/// result[127:112] := __b[63:48]
2879/// result[143:128] := __a[143:128]
2880/// . . .
2881/// result[255:239] := __b[191:176]
2882/// \endcode
2883///
2884/// \headerfile <immintrin.h>
2885///
2886/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2887///
2888/// \param __a
2889/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2890/// elements of the result.
2891/// \param __b
2892/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2893/// elements of the result.
2894/// \returns A 256-bit vector of [16 x i16] containing the result.
2895static __inline__ __m256i __DEFAULT_FN_ATTRS256
2897{
2898 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2899}
2900
2901/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2902/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2903/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2904/// of \a __a and \a __b as input; other bits in these parameters are
2905/// ignored.
2906///
2907/// \code{.operation}
2908/// result[31:0] := __a[31:0]
2909/// result[63:32] := __b[31:0]
2910/// result[95:64] := __a[63:32]
2911/// result[127:96] := __b[63:32]
2912/// result[159:128] := __a[159:128]
2913/// result[191:160] := __b[159:128]
2914/// result[223:192] := __a[191:160]
2915/// result[255:224] := __b[191:190]
2916/// \endcode
2917///
2918/// \headerfile <immintrin.h>
2919///
2920/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2921///
2922/// \param __a
2923/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2924/// elements of the result.
2925/// \param __b
2926/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2927/// elements of the result.
2928/// \returns A 256-bit vector of [8 x i32] containing the result.
2929static __inline__ __m256i __DEFAULT_FN_ATTRS256
2931{
2932 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2933}
2934
2935/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2936/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2937/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2938/// of \a __a and \a __b as input; other bits in these parameters are
2939/// ignored.
2940///
2941/// \code{.operation}
2942/// result[63:0] := __a[63:0]
2943/// result[127:64] := __b[63:0]
2944/// result[191:128] := __a[191:128]
2945/// result[255:192] := __b[191:128]
2946/// \endcode
2947///
2948/// \headerfile <immintrin.h>
2949///
2950/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2951///
2952/// \param __a
2953/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2954/// elements of the result.
2955/// \param __b
2956/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2957/// elements of the result.
2958/// \returns A 256-bit vector of [4 x i64] containing the result.
2959static __inline__ __m256i __DEFAULT_FN_ATTRS256
2961{
2962 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2963}
2964
2965/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2966/// \a __b.
2967///
2968/// \headerfile <immintrin.h>
2969///
2970/// This intrinsic corresponds to the \c VPXOR instruction.
2971///
2972/// \param __a
2973/// A 256-bit integer vector.
2974/// \param __b
2975/// A 256-bit integer vector.
2976/// \returns A 256-bit integer vector containing the result.
2977static __inline__ __m256i __DEFAULT_FN_ATTRS256
2978_mm256_xor_si256(__m256i __a, __m256i __b)
2979{
2980 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2981}
2982
2983/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2984/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2985/// boundary.
2986///
2987/// \headerfile <immintrin.h>
2988///
2989/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2990///
2991/// \param __V
2992/// A pointer to the 32-byte aligned memory containing the vector to load.
2993/// \returns A 256-bit integer vector loaded from memory.
2994static __inline__ __m256i __DEFAULT_FN_ATTRS256
2996{
2997 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2998 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2999}
3000
3001/// Broadcasts the 32-bit floating-point value from the low element of the
3002/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
3003/// 128-bit vector of [4 x float].
3004///
3005/// \headerfile <immintrin.h>
3006///
3007/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3008///
3009/// \param __X
3010/// A 128-bit vector of [4 x float] whose low element will be broadcast.
3011/// \returns A 128-bit vector of [4 x float] containing the result.
3012static __inline__ __m128 __DEFAULT_FN_ATTRS128
3014{
3015 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3016}
3017
3018/// Broadcasts the 64-bit floating-point value from the low element of the
3019/// 128-bit vector of [2 x double] in \a __a to both elements of the
3020/// result's 128-bit vector of [2 x double].
3021///
3022/// \headerfile <immintrin.h>
3023///
3024/// This intrinsic corresponds to the \c MOVDDUP instruction.
3025///
3026/// \param __a
3027/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3028/// \returns A 128-bit vector of [2 x double] containing the result.
3029static __inline__ __m128d __DEFAULT_FN_ATTRS128
3031{
3032 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3033}
3034
3035/// Broadcasts the 32-bit floating-point value from the low element of the
3036/// 128-bit vector of [4 x float] in \a __X to all elements of the
3037/// result's 256-bit vector of [8 x float].
3038///
3039/// \headerfile <immintrin.h>
3040///
3041/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3042///
3043/// \param __X
3044/// A 128-bit vector of [4 x float] whose low element will be broadcast.
3045/// \returns A 256-bit vector of [8 x float] containing the result.
3046static __inline__ __m256 __DEFAULT_FN_ATTRS256
3048{
3049 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3050}
3051
3052/// Broadcasts the 64-bit floating-point value from the low element of the
3053/// 128-bit vector of [2 x double] in \a __X to all elements of the
3054/// result's 256-bit vector of [4 x double].
3055///
3056/// \headerfile <immintrin.h>
3057///
3058/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3059///
3060/// \param __X
3061/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3062/// \returns A 256-bit vector of [4 x double] containing the result.
3063static __inline__ __m256d __DEFAULT_FN_ATTRS256
3065{
3066 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3067}
3068
3069/// Broadcasts the 128-bit integer data from \a __X to both the lower and
3070/// upper halves of the 256-bit result.
3071///
3072/// \headerfile <immintrin.h>
3073///
3074/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3075///
3076/// \param __X
3077/// A 128-bit integer vector to be broadcast.
3078/// \returns A 256-bit integer vector containing the result.
3079static __inline__ __m256i __DEFAULT_FN_ATTRS256
3081{
3082 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3083}
3084
3085#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3086
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                      (__v4si)(__m128i)(V2), (int)(M)))
3123
/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                      (__v8si)(__m256i)(V2), (int)(M)))
3160
3161/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3162/// bytes of the 256-bit result.
3163///
3164/// \headerfile <immintrin.h>
3165///
3166/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3167///
3168/// \param __X
3169/// A 128-bit integer vector whose low byte will be broadcast.
3170/// \returns A 256-bit integer vector containing the result.
3171static __inline__ __m256i __DEFAULT_FN_ATTRS256
3173{
3174 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3175}
3176
3177/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3178/// to all elements of the result's 256-bit vector of [16 x i16].
3179///
3180/// \headerfile <immintrin.h>
3181///
3182/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3183///
3184/// \param __X
3185/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3186/// \returns A 256-bit vector of [16 x i16] containing the result.
3187static __inline__ __m256i __DEFAULT_FN_ATTRS256
3189{
3190 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3191}
3192
3193/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3194/// to all elements of the result's 256-bit vector of [8 x i32].
3195///
3196/// \headerfile <immintrin.h>
3197///
3198/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3199///
3200/// \param __X
3201/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3202/// \returns A 256-bit vector of [8 x i32] containing the result.
3203static __inline__ __m256i __DEFAULT_FN_ATTRS256
3205{
3206 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3207}
3208
3209/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3210/// to all elements of the result's 256-bit vector of [4 x i64].
3211///
3212/// \headerfile <immintrin.h>
3213///
3214/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3215///
3216/// \param __X
3217/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3218/// \returns A 256-bit vector of [4 x i64] containing the result.
3219static __inline__ __m256i __DEFAULT_FN_ATTRS256
3221{
3222 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3223}
3224
3225/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3226/// bytes of the 128-bit result.
3227///
3228/// \headerfile <immintrin.h>
3229///
3230/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3231///
3232/// \param __X
3233/// A 128-bit integer vector whose low byte will be broadcast.
3234/// \returns A 128-bit integer vector containing the result.
3235static __inline__ __m128i __DEFAULT_FN_ATTRS128
3237{
3238 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3239}
3240
3241/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3242/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3243///
3244/// \headerfile <immintrin.h>
3245///
3246/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3247///
3248/// \param __X
3249/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3250/// \returns A 128-bit vector of [8 x i16] containing the result.
3251static __inline__ __m128i __DEFAULT_FN_ATTRS128
3253{
3254 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3255}
3256
3257/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3258/// to all elements of the result's vector of [4 x i32].
3259///
3260/// \headerfile <immintrin.h>
3261///
3262/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3263///
3264/// \param __X
3265/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3266/// \returns A 128-bit vector of [4 x i32] containing the result.
3267static __inline__ __m128i __DEFAULT_FN_ATTRS128
3269{
3270 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3271}
3272
3273/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3274/// to both elements of the result's 128-bit vector of [2 x i64].
3275///
3276/// \headerfile <immintrin.h>
3277///
3278/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3279///
3280/// \param __X
3281/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3282/// \returns A 128-bit vector of [2 x i64] containing the result.
3283static __inline__ __m128i __DEFAULT_FN_ATTRS128
3285{
3286 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3287}
3288
3289/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3290/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3291/// elements of the 256-bit vector of [8 x i32] in \a __b.
3292///
3293/// \code{.operation}
3294/// FOR i := 0 TO 7
3295/// j := i*32
3296/// k := __b[j+2:j] * 32
3297/// result[j+31:j] := __a[k+31:k]
3298/// ENDFOR
3299/// \endcode
3300///
3301/// \headerfile <immintrin.h>
3302///
3303/// This intrinsic corresponds to the \c VPERMD instruction.
3304///
3305/// \param __a
3306/// A 256-bit vector of [8 x i32] containing the source values.
3307/// \param __b
3308/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3309/// \a __a.
3310/// \returns A 256-bit vector of [8 x i32] containing the result.
3311static __inline__ __m256i __DEFAULT_FN_ATTRS256
3313{
3314 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3315}
3316
/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
///    the 256-bit vector of [4 x double] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3346
3347/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3348/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3349/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3350///
3351/// \code{.operation}
3352/// FOR i := 0 TO 7
3353/// j := i*32
3354/// k := __b[j+2:j] * 32
3355/// result[j+31:j] := __a[k+31:k]
3356/// ENDFOR
3357/// \endcode
3358///
3359/// \headerfile <immintrin.h>
3360///
3361/// This intrinsic corresponds to the \c VPERMPS instruction.
3362///
3363/// \param __a
3364/// A 256-bit vector of [8 x float] containing the source values.
3365/// \param __b
3366/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3367/// \a __a.
3368/// \returns A 256-bit vector of [8 x float] containing the result.
3369static __inline__ __m256 __DEFAULT_FN_ATTRS256
3371{
3372 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3373}
3374
3375/// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3376/// of the 256-bit vector of [4 x i64] in \a V as specified by the
3377/// immediate value \a M.
3378///
3379/// \code{.operation}
3380/// FOR i := 0 TO 3
3381/// j := i*64
3382/// k := (M >> (i*2))[1:0] * 64
3383/// result[j+63:j] := V[k+63:k]
3384/// ENDFOR
3385/// \endcode
3386///
3387/// \headerfile <immintrin.h>
3388///
3389/// \code
3390/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3391/// \endcode
3392///
3393/// This intrinsic corresponds to the \c VPERMQ instruction.
3394///
3395/// \param V
3396/// A 256-bit vector of [4 x i64] containing the source values.
3397/// \param M
3398/// An immediate 8-bit value specifying which elements to copy from \a V.
3399/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3400/// \a M[3:2] specifies the index for element 1, and so forth.
3401/// \returns A 256-bit vector of [4 x i64] containing the result.
3402#define _mm256_permute4x64_epi64(V, M) \
3403 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3404
3405/// Sets each half of the 256-bit result either to zero or to one of the
3406/// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3407/// as specified by the immediate value \a M.
3408///
3409/// \code{.operation}
3410/// FOR i := 0 TO 1
3411/// j := i*128
3412/// k := M >> (i*4)
3413/// IF k[3] == 0
3414/// CASE (k[1:0]) OF
3415/// 0: result[127+j:j] := V1[127:0]
3416/// 1: result[127+j:j] := V1[255:128]
3417/// 2: result[127+j:j] := V2[127:0]
3418/// 3: result[127+j:j] := V2[255:128]
3419/// ESAC
3420/// ELSE
3421/// result[127+j:j] := 0
3422/// FI
3423/// ENDFOR
3424/// \endcode
3425///
3426/// \headerfile <immintrin.h>
3427///
3428/// \code
3429/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3430/// \endcode
3431///
3432/// This intrinsic corresponds to the \c VPERM2I128 instruction.
3433///
3434/// \param V1
3435/// A 256-bit integer vector containing source values.
3436/// \param V2
3437/// A 256-bit integer vector containing source values.
3438/// \param M
3439/// An immediate value specifying how to form the result. Bits [3:0]
3440/// control the lower half of the result, bits [7:4] control the upper half.
3441/// Within each 4-bit control value, if bit 3 is 1, that half of the result
3442/// is zero; otherwise bits [1:0] select its source as follows. \n
3443/// 0: the lower half of \a V1 \n
3444/// 1: the upper half of \a V1 \n
3445/// 2: the lower half of \a V2 \n
3446/// 3: the upper half of \a V2
3447/// \returns A 256-bit integer vector containing the result.
3448#define _mm256_permute2x128_si256(V1, V2, M) \
3449 ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3450
3451/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3452/// of the immediate \a M is zero, extracts the lower half of \a V;
3453/// otherwise, extracts the upper half.
3454///
3455/// \headerfile <immintrin.h>
3456///
3457/// \code
3458/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3459/// \endcode
3460///
3461/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3462///
3463/// \param V
3464/// A 256-bit integer vector containing the source values.
3465/// \param M
3466/// An immediate value; bit 0 selects which half of \a V to extract.
3467/// \returns A 128-bit integer vector containing the result.
3468#define _mm256_extracti128_si256(V, M) \
3469 ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3470
3471/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3472/// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3473/// is zero, overwrites the lower half of the result; otherwise,
3474/// overwrites the upper half.
3475///
3476/// \headerfile <immintrin.h>
3477///
3478/// \code
3479/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3480/// \endcode
3481///
3482/// This intrinsic corresponds to the \c VINSERTI128 instruction.
3483///
3484/// \param V1
3485/// A 256-bit integer vector containing a source value.
3486/// \param V2
3487/// A 128-bit integer vector containing a source value.
3488/// \param M
3489/// An immediate value; bit 0 selects which half of the result receives \a V2.
3490/// \returns A 256-bit integer vector containing the result.
3491#define _mm256_inserti128_si256(V1, V2, M) \
3492 ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
3493 (__v2di)(__m128i)(V2), (int)(M)))
3494
3495/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3496/// the most significant bit of the corresponding element in the mask
3497/// \a __M is set; otherwise, sets that element of the result to zero.
3498/// Returns the 256-bit [8 x i32] result.
3499///
3500/// \code{.operation}
3501/// FOR i := 0 TO 7
3502/// j := i*32
3503/// IF __M[j+31] == 1
3504/// result[j+31:j] := Load32(__X+(i*4))
3505/// ELSE
3506/// result[j+31:j] := 0
3507/// FI
3508/// ENDFOR
3509/// \endcode
3510///
3511/// \headerfile <immintrin.h>
3512///
3513/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3514///
3515/// \param __X
3516/// A pointer to the memory used for loading values.
3517/// \param __M
3518/// A 256-bit vector of [8 x i32] containing the mask bits.
3519/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3520/// elements.
3521static __inline__ __m256i __DEFAULT_FN_ATTRS256
3522_mm256_maskload_epi32(int const *__X, __m256i __M)
3523{
  /* __X is handed to the builtin as a pointer to a whole const __v8si vector,
     and __M as the matching __v8si mask. */
3524 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3525}
3526
3527/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3528/// the most significant bit of the corresponding element in the mask
3529/// \a __M is set; otherwise, sets that element of the result to zero.
3530/// Returns the 256-bit [4 x i64] result.
3531///
3532/// \code{.operation}
3533/// FOR i := 0 TO 3
3534/// j := i*64
3535/// IF __M[j+63] == 1
3536/// result[j+63:j] := Load64(__X+(i*8))
3537/// ELSE
3538/// result[j+63:j] := 0
3539/// FI
3540/// ENDFOR
3541/// \endcode
3542///
3543/// \headerfile <immintrin.h>
3544///
3545/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3546///
3547/// \param __X
3548/// A pointer to the memory used for loading values.
3549/// \param __M
3550/// A 256-bit vector of [4 x i64] containing the mask bits.
3551/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3552/// elements.
3553static __inline__ __m256i __DEFAULT_FN_ATTRS256
3554_mm256_maskload_epi64(long long const *__X, __m256i __M)
3555{
  /* __X is handed to the builtin as a pointer to a whole const __v4di vector,
     and __M as the matching __v4di mask. */
3556 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3557}
3558
3559/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3560/// the most significant bit of the corresponding element in the mask
3561/// \a __M is set; otherwise, sets that element of the result to zero.
3562/// Returns the 128-bit [4 x i32] result.
3563///
3564/// \code{.operation}
3565/// FOR i := 0 TO 3
3566/// j := i*32
3567/// IF __M[j+31] == 1
3568/// result[j+31:j] := Load32(__X+(i*4))
3569/// ELSE
3570/// result[j+31:j] := 0
3571/// FI
3572/// ENDFOR
3573/// \endcode
3574///
3575/// \headerfile <immintrin.h>
3576///
3577/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3578///
3579/// \param __X
3580/// A pointer to the memory used for loading values.
3581/// \param __M
3582/// A 128-bit vector of [4 x i32] containing the mask bits.
3583/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3584/// elements.
3585static __inline__ __m128i __DEFAULT_FN_ATTRS128
3586_mm_maskload_epi32(int const *__X, __m128i __M)
3587{
  /* __X is handed to the builtin as a pointer to a whole const __v4si vector,
     and __M as the matching __v4si mask. */
3588 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3589}
3590
3591/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3592/// the most significant bit of the corresponding element in the mask
3593/// \a __M is set; otherwise, sets that element of the result to zero.
3594/// Returns the 128-bit [2 x i64] result.
3595///
3596/// \code{.operation}
3597/// FOR i := 0 TO 1
3598/// j := i*64
3599/// IF __M[j+63] == 1
3600/// result[j+63:j] := Load64(__X+(i*8))
3601/// ELSE
3602/// result[j+63:j] := 0
3603/// FI
3604/// ENDFOR
3605/// \endcode
3606///
3607/// \headerfile <immintrin.h>
3608///
3609/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3610///
3611/// \param __X
3612/// A pointer to the memory used for loading values.
3613/// \param __M
3614/// A 128-bit vector of [2 x i64] containing the mask bits.
3615/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3616/// elements.
3617static __inline__ __m128i __DEFAULT_FN_ATTRS128
3618_mm_maskload_epi64(long long const *__X, __m128i __M)
3619{
  /* __X is handed to the builtin as a pointer to a whole const __v2di vector,
     and __M as the matching __v2di mask. */
3620 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3621}
3622
3623/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3624/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3625/// the corresponding element in the mask \a __M is set; otherwise, the
3626/// memory element is unchanged.
3627///
3628/// \code{.operation}
3629/// FOR i := 0 TO 7
3630/// j := i*32
3631/// IF __M[j+31] == 1
3632/// Store32(__X+(i*4), __Y[j+31:j])
3633/// FI
3634/// ENDFOR
3635/// \endcode
3636///
3637/// \headerfile <immintrin.h>
3638///
3639/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3640///
3641/// \param __X
3642/// A pointer to the memory used for storing values.
3643/// \param __M
3644/// A 256-bit vector of [8 x i32] containing the mask bits.
3645/// \param __Y
3646/// A 256-bit vector of [8 x i32] containing the values to store.
3647static __inline__ void __DEFAULT_FN_ATTRS256
3648_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3649{
  /* Builtin argument order is destination, mask, data; the destination is
     passed as a pointer to a whole __v8si vector. */
3650 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3651}
3652
3653/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3654/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3655/// the corresponding element in the mask \a __M is set; otherwise, the
3656/// memory element is unchanged.
3657///
3658/// \code{.operation}
3659/// FOR i := 0 TO 3
3660/// j := i*64
3661/// IF __M[j+63] == 1
3662/// Store64(__X+(i*8), __Y[j+63:j])
3663/// FI
3664/// ENDFOR
3665/// \endcode
3666///
3667/// \headerfile <immintrin.h>
3668///
3669/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3670///
3671/// \param __X
3672/// A pointer to the memory used for storing values.
3673/// \param __M
3674/// A 256-bit vector of [4 x i64] containing the mask bits.
3675/// \param __Y
3676/// A 256-bit vector of [4 x i64] containing the values to store.
3677static __inline__ void __DEFAULT_FN_ATTRS256
3678_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3679{
  /* Builtin argument order is destination, mask, data; the destination is
     passed as a pointer to a whole __v4di vector. */
3680 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3681}
3682
3683/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3684/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3685/// the corresponding element in the mask \a __M is set; otherwise, the
3686/// memory element is unchanged.
3687///
3688/// \code{.operation}
3689/// FOR i := 0 TO 3
3690/// j := i*32
3691/// IF __M[j+31] == 1
3692/// Store32(__X+(i*4), __Y[j+31:j])
3693/// FI
3694/// ENDFOR
3695/// \endcode
3696///
3697/// \headerfile <immintrin.h>
3698///
3699/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3700///
3701/// \param __X
3702/// A pointer to the memory used for storing values.
3703/// \param __M
3704/// A 128-bit vector of [4 x i32] containing the mask bits.
3705/// \param __Y
3706/// A 128-bit vector of [4 x i32] containing the values to store.
3707static __inline__ void __DEFAULT_FN_ATTRS128
3708_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3709{
  /* Builtin argument order is destination, mask, data; the destination is
     passed as a pointer to a whole __v4si vector. */
3710 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3711}
3712
3713/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3714/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3715/// the corresponding element in the mask \a __M is set; otherwise, the
3716/// memory element is unchanged.
3717///
3718/// \code{.operation}
3719/// FOR i := 0 TO 1
3720/// j := i*64
3721/// IF __M[j+63] == 1
3722/// Store64(__X+(i*8), __Y[j+63:j])
3723/// FI
3724/// ENDFOR
3725/// \endcode
3726///
3727/// \headerfile <immintrin.h>
3728///
3729/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3730///
3731/// \param __X
3732/// A pointer to the memory used for storing values.
3733/// \param __M
3734/// A 128-bit vector of [2 x i64] containing the mask bits.
3735/// \param __Y
3736/// A 128-bit vector of [2 x i64] containing the values to store.
3737static __inline__ void __DEFAULT_FN_ATTRS128
3738_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3739{
  /* Builtin argument order is destination, mask, data; the destination is
     passed as a pointer to a whole __v2di vector. */
3740 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3741}
3742
3743/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3744/// left by the number of bits given in the corresponding element of the
3745/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3746/// returns the result. If the shift count for any element is greater than
3747/// 31, the result for that element is zero.
3748///
3749/// \headerfile <immintrin.h>
3750///
3751/// This intrinsic corresponds to the \c VPSLLVD instruction.
3752///
3753/// \param __X
3754/// A 256-bit vector of [8 x i32] to be shifted.
3755/// \param __Y
3756/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3757/// bits).
3758/// \returns A 256-bit vector of [8 x i32] containing the result.
3759static __inline__ __m256i __DEFAULT_FN_ATTRS256
3760_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3761{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v8si. */
3762 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3763}
3764
3765/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3766/// left by the number of bits given in the corresponding element of the
3767/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3768/// returns the result. If the shift count for any element is greater than
3769/// 31, the result for that element is zero.
3770///
3771/// \headerfile <immintrin.h>
3772///
3773/// This intrinsic corresponds to the \c VPSLLVD instruction.
3774///
3775/// \param __X
3776/// A 128-bit vector of [4 x i32] to be shifted.
3777/// \param __Y
3778/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3779/// bits).
3780/// \returns A 128-bit vector of [4 x i32] containing the result.
3781static __inline__ __m128i __DEFAULT_FN_ATTRS128
3782_mm_sllv_epi32(__m128i __X, __m128i __Y)
3783{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v4si. */
3784 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3785}
3786
3787/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3788/// left by the number of bits given in the corresponding element of the
3789/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3790/// returns the result. If the shift count for any element is greater than
3791/// 63, the result for that element is zero.
3792///
3793/// \headerfile <immintrin.h>
3794///
3795/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3796///
3797/// \param __X
3798/// A 256-bit vector of [4 x i64] to be shifted.
3799/// \param __Y
3800/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3801/// bits).
3802/// \returns A 256-bit vector of [4 x i64] containing the result.
3803static __inline__ __m256i __DEFAULT_FN_ATTRS256
3804_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3805{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v4di. */
3806 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3807}
3808
3809/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3810/// left by the number of bits given in the corresponding element of the
3811/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3812/// returns the result. If the shift count for any element is greater than
3813/// 63, the result for that element is zero.
3814///
3815/// \headerfile <immintrin.h>
3816///
3817/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3818///
3819/// \param __X
3820/// A 128-bit vector of [2 x i64] to be shifted.
3821/// \param __Y
3822/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3823/// bits).
3824/// \returns A 128-bit vector of [2 x i64] containing the result.
3825static __inline__ __m128i __DEFAULT_FN_ATTRS128
3826_mm_sllv_epi64(__m128i __X, __m128i __Y)
3827{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v2di. */
3828 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3829}
3830
3831/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3832/// right by the number of bits given in the corresponding element of the
3833/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3834/// returns the result. If the shift count for any element is greater than
3835/// 31, the result for that element is 0 or -1 according to the sign bit
3836/// for that element.
3837///
3838/// \headerfile <immintrin.h>
3839///
3840/// This intrinsic corresponds to the \c VPSRAVD instruction.
3841///
3842/// \param __X
3843/// A 256-bit vector of [8 x i32] to be shifted.
3844/// \param __Y
3845/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3846/// bits).
3847/// \returns A 256-bit vector of [8 x i32] containing the result.
3848static __inline__ __m256i __DEFAULT_FN_ATTRS256
3849_mm256_srav_epi32(__m256i __X, __m256i __Y)
3850{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v8si. */
3851 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3852}
3853
3854/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3855/// right by the number of bits given in the corresponding element of the
3856/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3857/// returns the result. If the shift count for any element is greater than
3858/// 31, the result for that element is 0 or -1 according to the sign bit
3859/// for that element.
3860///
3861/// \headerfile <immintrin.h>
3862///
3863/// This intrinsic corresponds to the \c VPSRAVD instruction.
3864///
3865/// \param __X
3866/// A 128-bit vector of [4 x i32] to be shifted.
3867/// \param __Y
3868/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3869/// bits).
3870/// \returns A 128-bit vector of [4 x i32] containing the result.
3871static __inline__ __m128i __DEFAULT_FN_ATTRS128
3872_mm_srav_epi32(__m128i __X, __m128i __Y)
3873{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v4si. */
3874 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3875}
3876
3877/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3878/// right by the number of bits given in the corresponding element of the
3879/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3880/// returns the result. If the shift count for any element is greater than
3881/// 31, the result for that element is zero.
3882///
3883/// \headerfile <immintrin.h>
3884///
3885/// This intrinsic corresponds to the \c VPSRLVD instruction.
3886///
3887/// \param __X
3888/// A 256-bit vector of [8 x i32] to be shifted.
3889/// \param __Y
3890/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3891/// bits).
3892/// \returns A 256-bit vector of [8 x i32] containing the result.
3893static __inline__ __m256i __DEFAULT_FN_ATTRS256
3894_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3895{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v8si. */
3896 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3897}
3898
3899/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3900/// right by the number of bits given in the corresponding element of the
3901/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3902/// returns the result. If the shift count for any element is greater than
3903/// 31, the result for that element is zero.
3904///
3905/// \headerfile <immintrin.h>
3906///
3907/// This intrinsic corresponds to the \c VPSRLVD instruction.
3908///
3909/// \param __X
3910/// A 128-bit vector of [4 x i32] to be shifted.
3911/// \param __Y
3912/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3913/// bits).
3914/// \returns A 128-bit vector of [4 x i32] containing the result.
3915static __inline__ __m128i __DEFAULT_FN_ATTRS128
3916_mm_srlv_epi32(__m128i __X, __m128i __Y)
3917{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v4si. */
3918 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3919}
3920
3921/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3922/// right by the number of bits given in the corresponding element of the
3923/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3924/// returns the result. If the shift count for any element is greater than
3925/// 63, the result for that element is zero.
3926///
3927/// \headerfile <immintrin.h>
3928///
3929/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3930///
3931/// \param __X
3932/// A 256-bit vector of [4 x i64] to be shifted.
3933/// \param __Y
3934/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3935/// bits).
3936/// \returns A 256-bit vector of [4 x i64] containing the result.
3937static __inline__ __m256i __DEFAULT_FN_ATTRS256
3938_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3939{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v4di. */
3940 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3941}
3942
3943/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3944/// right by the number of bits given in the corresponding element of the
3945/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3946/// returns the result. If the shift count for any element is greater than
3947/// 63, the result for that element is zero.
3948///
3949/// \headerfile <immintrin.h>
3950///
3951/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3952///
3953/// \param __X
3954/// A 128-bit vector of [2 x i64] to be shifted.
3955/// \param __Y
3956/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3957/// bits).
3958/// \returns A 128-bit vector of [2 x i64] containing the result.
3959static __inline__ __m128i __DEFAULT_FN_ATTRS128
3960_mm_srlv_epi64(__m128i __X, __m128i __Y)
3961{
  /* __X is the data vector and __Y the per-element shift counts; both are
     passed to the builtin as __v2di. */
3962 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3963}
3964
3965/// Conditionally gathers two 64-bit floating-point values, either from the
3966/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3967/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3968/// of [2 x double] in \a mask determines the source for each element.
3969///
3970/// \code{.operation}
3971/// FOR element := 0 TO 1
3972/// j := element*64
3973/// k := element*32
3974/// IF mask[j+63] == 0
3975/// result[j+63:j] := a[j+63:j]
3976/// ELSE
3977/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3978/// FI
3979/// ENDFOR
3980/// \endcode
3981///
3982/// \headerfile <immintrin.h>
3983///
3984/// \code
3985/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
3986/// __m128d mask, const int s);
3987/// \endcode
3988///
3989/// This intrinsic corresponds to the \c VGATHERDPD instruction.
3990///
3991/// \param a
3992/// A 128-bit vector of [2 x double] used as the source when a mask bit is
3993/// zero.
3994/// \param m
3995/// A pointer to the memory used for loading values.
3996/// \param i
3997/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3998/// the first two elements are used.
3999/// \param mask
4000/// A 128-bit vector of [2 x double] containing the mask. The most
4001/// significant bit of each element in the mask vector represents the mask
4002/// bits. If a mask bit is zero, the corresponding value from vector \a a
4003/// is copied to the result; otherwise the value is loaded from memory.
4004/// \param s
4005/// A literal constant scale factor for the indexes in \a i. Must be
4006/// 1, 2, 4, or 8.
4007/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4008#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
4009 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
4010 (double const *)(m), \
4011 (__v4si)(__m128i)(i), \
4012 (__v2df)(__m128d)(mask), (s)))
4013
4014/// Conditionally gathers four 64-bit floating-point values, either from the
4015/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4016/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4017/// of [4 x double] in \a mask determines the source for each element.
4018///
4019/// \code{.operation}
4020/// FOR element := 0 TO 3
4021/// j := element*64
4022/// k := element*32
4023/// IF mask[j+63] == 0
4024/// result[j+63:j] := a[j+63:j]
4025/// ELSE
4026/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4027/// FI
4028/// ENDFOR
4029/// \endcode
4030///
4031/// \headerfile <immintrin.h>
4032///
4033/// \code
4034/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
4035/// __m256d mask, const int s);
4036/// \endcode
4037///
4038/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4039///
4040/// \param a
4041/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4042/// zero.
4043/// \param m
4044/// A pointer to the memory used for loading values.
4045/// \param i
4046/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4047/// \param mask
4048/// A 256-bit vector of [4 x double] containing the mask. The most
4049/// significant bit of each element in the mask vector represents the mask
4050/// bits. If a mask bit is zero, the corresponding value from vector \a a
4051/// is copied to the result; otherwise the value is loaded from memory.
4052/// \param s
4053/// A literal constant scale factor for the indexes in \a i. Must be
4054/// 1, 2, 4, or 8.
4055/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4056#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
4057 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
4058 (double const *)(m), \
4059 (__v4si)(__m128i)(i), \
4060 (__v4df)(__m256d)(mask), (s)))
4061
4062/// Conditionally gathers two 64-bit floating-point values, either from the
4063/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4064/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4065/// of [2 x double] in \a mask determines the source for each element.
4066///
4067/// \code{.operation}
4068/// FOR element := 0 TO 1
4069/// j := element*64
4070/// k := element*64
4071/// IF mask[j+63] == 0
4072/// result[j+63:j] := a[j+63:j]
4073/// ELSE
4074/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4075/// FI
4076/// ENDFOR
4077/// \endcode
4078///
4079/// \headerfile <immintrin.h>
4080///
4081/// \code
4082/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4083/// __m128d mask, const int s);
4084/// \endcode
4085///
4086/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4087///
4088/// \param a
4089/// A 128-bit vector of [2 x double] used as the source when a mask bit is
4090/// zero.
4091/// \param m
4092/// A pointer to the memory used for loading values.
4093/// \param i
4094/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4095/// \param mask
4096/// A 128-bit vector of [2 x double] containing the mask. The most
4097/// significant bit of each element in the mask vector represents the mask
4098/// bits. If a mask bit is zero, the corresponding value from vector \a a
4099/// is copied to the result; otherwise the value is loaded from memory.
4100/// \param s
4101/// A literal constant scale factor for the indexes in \a i. Must be
4102/// 1, 2, 4, or 8.
4103/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4104#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
4105 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
4106 (double const *)(m), \
4107 (__v2di)(__m128i)(i), \
4108 (__v2df)(__m128d)(mask), (s)))
4109
4110/// Conditionally gathers four 64-bit floating-point values, either from the
4111/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4112/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4113/// of [4 x double] in \a mask determines the source for each element.
4114///
4115/// \code{.operation}
4116/// FOR element := 0 TO 3
4117/// j := element*64
4118/// k := element*64
4119/// IF mask[j+63] == 0
4120/// result[j+63:j] := a[j+63:j]
4121/// ELSE
4122/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4123/// FI
4124/// ENDFOR
4125/// \endcode
4126///
4127/// \headerfile <immintrin.h>
4128///
4129/// \code
4130/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4131/// __m256d mask, const int s);
4132/// \endcode
4133///
4134/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4135///
4136/// \param a
4137/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4138/// zero.
4139/// \param m
4140/// A pointer to the memory used for loading values.
4141/// \param i
4142/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4143/// \param mask
4144/// A 256-bit vector of [4 x double] containing the mask. The most
4145/// significant bit of each element in the mask vector represents the mask
4146/// bits. If a mask bit is zero, the corresponding value from vector \a a
4147/// is copied to the result; otherwise the value is loaded from memory.
4148/// \param s
4149/// A literal constant scale factor for the indexes in \a i. Must be
4150/// 1, 2, 4, or 8.
4151/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4152#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
4153 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
4154 (double const *)(m), \
4155 (__v4di)(__m256i)(i), \
4156 (__v4df)(__m256d)(mask), (s)))
4157
4158/// Conditionally gathers four 32-bit floating-point values, either from the
4159/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4160/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4161/// of [4 x float] in \a mask determines the source for each element.
4162///
4163/// \code{.operation}
4164/// FOR element := 0 TO 3
4165/// j := element*32
4166/// k := element*32
4167/// IF mask[j+31] == 0
4168/// result[j+31:j] := a[j+31:j]
4169/// ELSE
4170/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4171/// FI
4172/// ENDFOR
4173/// \endcode
4174///
4175/// \headerfile <immintrin.h>
4176///
4177/// \code
4178/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4179/// __m128 mask, const int s);
4180/// \endcode
4181///
4182/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4183///
4184/// \param a
4185/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4186/// zero.
4187/// \param m
4188/// A pointer to the memory used for loading values.
4189/// \param i
4190/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4191/// \param mask
4192/// A 128-bit vector of [4 x float] containing the mask. The most
4193/// significant bit of each element in the mask vector represents the mask
4194/// bits. If a mask bit is zero, the corresponding value from vector \a a
4195/// is copied to the result; otherwise the value is loaded from memory.
4196/// \param s
4197/// A literal constant scale factor for the indexes in \a i. Must be
4198/// 1, 2, 4, or 8.
4199/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4200#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
4201 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
4202 (float const *)(m), \
4203 (__v4si)(__m128i)(i), \
4204 (__v4sf)(__m128)(mask), (s)))
4205
4206/// Conditionally gathers eight 32-bit floating-point values, either from the
4207/// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4208/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4209/// of [8 x float] in \a mask determines the source for each element.
4210///
4211/// \code{.operation}
4212/// FOR element := 0 to 7
4213/// j := element*32
4214/// k := element*32
4215/// IF mask[j+31] == 0
4216/// result[j+31:j] := a[j+31:j]
4217/// ELSE
4218/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4219/// FI
4220/// ENDFOR
4221/// \endcode
4222///
4223/// \headerfile <immintrin.h>
4224///
4225/// \code
4226/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4227/// __m256 mask, const int s);
4228/// \endcode
4229///
4230/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4231///
4232/// \param a
4233/// A 256-bit vector of [8 x float] used as the source when a mask bit is
4234/// zero.
4235/// \param m
4236/// A pointer to the memory used for loading values.
4237/// \param i
4238/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4239/// \param mask
4240/// A 256-bit vector of [8 x float] containing the mask. The most
4241/// significant bit of each element in the mask vector is that element's
4242/// mask bit. If a mask bit is zero, the corresponding element of \a a is
4243/// copied to the result; otherwise the value is loaded from memory.
4244/// \param s
4245/// A literal constant scale factor for the indexes in \a i. Must be
4246/// 1, 2, 4, or 8.
4247/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4248#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
4249 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
4250 (float const *)(m), \
4251 (__v8si)(__m256i)(i), \
4252 (__v8sf)(__m256)(mask), (s)))
4253
4254/// Conditionally gathers two 32-bit floating-point values, either from the
4255/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4256/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4257/// of [4 x float] in \a mask determines the source for the lower two
4258/// elements. The upper two elements of the result are zeroed.
4259///
4260/// \code{.operation}
4261/// FOR element := 0 to 1
4262/// j := element*32
4263/// k := element*64
4264/// IF mask[j+31] == 0
4265/// result[j+31:j] := a[j+31:j]
4266/// ELSE
4267/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4268/// FI
4269/// ENDFOR
4270/// result[127:64] := 0
4271/// \endcode
4272///
4273/// \headerfile <immintrin.h>
4274///
4275/// \code
4276/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4277/// __m128 mask, const int s);
4278/// \endcode
4279///
4280/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4281///
4282/// \param a
4283/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4284/// zero. Only the first two elements are used.
4285/// \param m
4286/// A pointer to the memory used for loading values.
4287/// \param i
4288/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4289/// \param mask
4290/// A 128-bit vector of [4 x float] containing the mask. The most
4291/// significant bit of each element in the mask vector is that element's
4292/// mask bit. If a mask bit is zero, the corresponding element of \a a is
4293/// copied to the result; otherwise the value is loaded from memory. Only
4294/// the first two elements are used.
4295/// \param s
4296/// A literal constant scale factor for the indexes in \a i. Must be
4297/// 1, 2, 4, or 8.
4298/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4299#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
4300 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
4301 (float const *)(m), \
4302 (__v2di)(__m128i)(i), \
4303 (__v4sf)(__m128)(mask), (s)))
4304
4305/// Conditionally gathers four 32-bit floating-point values, either from the
4306/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4307/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4308/// of [4 x float] in \a mask determines the source for each element.
4309///
4310/// \code{.operation}
4311/// FOR element := 0 to 3
4312/// j := element*32
4313/// k := element*64
4314/// IF mask[j+31] == 0
4315/// result[j+31:j] := a[j+31:j]
4316/// ELSE
4317/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4318/// FI
4319/// ENDFOR
4320/// \endcode
4321///
4322/// \headerfile <immintrin.h>
4323///
4324/// \code
4325/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4326/// __m128 mask, const int s);
4327/// \endcode
4328///
4329/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4330///
4331/// \param a
4332/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4333/// zero.
4334/// \param m
4335/// A pointer to the memory used for loading values.
4336/// \param i
4337/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4338/// \param mask
4339/// A 128-bit vector of [4 x float] containing the mask. The most
4340/// significant bit of each element in the mask vector is that element's
4341/// mask bit. If a mask bit is zero, the corresponding element of \a a is
4342/// copied to the result; otherwise the value is loaded from memory.
4343/// \param s
4344/// A literal constant scale factor for the indexes in \a i. Must be
4345/// 1, 2, 4, or 8.
4346/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4347#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
4348 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
4349 (float const *)(m), \
4350 (__v4di)(__m256i)(i), \
4351 (__v4sf)(__m128)(mask), (s)))
4352
4353/// Conditionally gathers four 32-bit integer values, either from the
4354/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4355/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4356/// of [4 x i32] in \a mask determines the source for each element.
4357///
4358/// \code{.operation}
4359/// FOR element := 0 to 3
4360/// j := element*32
4361/// k := element*32
4362/// IF mask[j+31] == 0
4363/// result[j+31:j] := a[j+31:j]
4364/// ELSE
4365/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4366/// FI
4367/// ENDFOR
4368/// \endcode
4369///
4370/// \headerfile <immintrin.h>
4371///
4372/// \code
4373/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4374/// __m128i mask, const int s);
4375/// \endcode
4376///
4377/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4378///
4379/// \param a
4380/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4381/// zero.
4382/// \param m
4383/// A pointer to the memory used for loading values.
4384/// \param i
4385/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4386/// \param mask
4387/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4388/// bit of each element in the mask vector is that element's mask bit. If a
4389/// mask bit is zero, the corresponding element of \a a is copied to the
4390/// result; otherwise the value is loaded from memory.
4391/// \param s
4392/// A literal constant scale factor for the indexes in \a i. Must be
4393/// 1, 2, 4, or 8.
4394/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4395#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
4396 ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
4397 (int const *)(m), \
4398 (__v4si)(__m128i)(i), \
4399 (__v4si)(__m128i)(mask), (s)))
4400
4401/// Conditionally gathers eight 32-bit integer values, either from the
4402/// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4403/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4404/// of [8 x i32] in \a mask determines the source for each element.
4405///
4406/// \code{.operation}
4407/// FOR element := 0 to 7
4408/// j := element*32
4409/// k := element*32
4410/// IF mask[j+31] == 0
4411/// result[j+31:j] := a[j+31:j]
4412/// ELSE
4413/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4414/// FI
4415/// ENDFOR
4416/// \endcode
4417///
4418/// \headerfile <immintrin.h>
4419///
4420/// \code
4421/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4422/// __m256i mask, const int s);
4423/// \endcode
4424///
4425/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4426///
4427/// \param a
4428/// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4429/// zero.
4430/// \param m
4431/// A pointer to the memory used for loading values.
4432/// \param i
4433/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4434/// \param mask
4435/// A 256-bit vector of [8 x i32] containing the mask. The most significant
4436/// bit of each element in the mask vector is that element's mask bit. If a
4437/// mask bit is zero, the corresponding element of \a a is copied to the
4438/// result; otherwise the value is loaded from memory.
4439/// \param s
4440/// A literal constant scale factor for the indexes in \a i. Must be
4441/// 1, 2, 4, or 8.
4442/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4443#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
4444 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
4445 (int const *)(m), \
4446 (__v8si)(__m256i)(i), \
4447 (__v8si)(__m256i)(mask), (s)))
4448
4449/// Conditionally gathers two 32-bit integer values, either from the
4450/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4451/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4452/// of [4 x i32] in \a mask determines the source for the lower two
4453/// elements. The upper two elements of the result are zeroed.
4454///
4455/// \code{.operation}
4456/// FOR element := 0 to 1
4457/// j := element*32
4458/// k := element*64
4459/// IF mask[j+31] == 0
4460/// result[j+31:j] := a[j+31:j]
4461/// ELSE
4462/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4463/// FI
4464/// ENDFOR
4465/// result[127:64] := 0
4466/// \endcode
4467///
4468/// \headerfile <immintrin.h>
4469///
4470/// \code
4471/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4472/// __m128i mask, const int s);
4473/// \endcode
4474///
4475/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4476///
4477/// \param a
4478/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4479/// zero. Only the first two elements are used.
4480/// \param m
4481/// A pointer to the memory used for loading values.
4482/// \param i
4483/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4484/// \param mask
4485/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4486/// bit of each element in the mask vector is that element's mask bit. If a
4487/// mask bit is zero, the corresponding element of \a a is copied to the
4488/// result; otherwise the value is loaded from memory. Only the first two
4489/// elements are used.
4490/// \param s
4491/// A literal constant scale factor for the indexes in \a i. Must be
4492/// 1, 2, 4, or 8.
4493/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4494#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
4495 ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
4496 (int const *)(m), \
4497 (__v2di)(__m128i)(i), \
4498 (__v4si)(__m128i)(mask), (s)))
4499
4500/// Conditionally gathers four 32-bit integer values, either from the
4501/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4502/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4503/// of [4 x i32] in \a mask determines the source for each element.
4504///
4505/// \code{.operation}
4506/// FOR element := 0 to 3
4507/// j := element*32
4508/// k := element*64
4509/// IF mask[j+31] == 0
4510/// result[j+31:j] := a[j+31:j]
4511/// ELSE
4512/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4513/// FI
4514/// ENDFOR
4515/// \endcode
4516///
4517/// \headerfile <immintrin.h>
4518///
4519/// \code
4520/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4521/// __m128i mask, const int s);
4522/// \endcode
4523///
4524/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4525///
4526/// \param a
4527/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4528/// zero.
4529/// \param m
4530/// A pointer to the memory used for loading values.
4531/// \param i
4532/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4533/// \param mask
4534/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4535/// bit of each element in the mask vector is that element's mask bit. If a
4536/// mask bit is zero, the corresponding element of \a a is copied to the
4537/// result; otherwise the value is loaded from memory.
4538/// \param s
4539/// A literal constant scale factor for the indexes in \a i. Must be
4540/// 1, 2, 4, or 8.
4541/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4542#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
4543 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
4544 (int const *)(m), \
4545 (__v4di)(__m256i)(i), \
4546 (__v4si)(__m128i)(mask), (s)))
4547
4548/// Conditionally gathers two 64-bit integer values, either from the
4549/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4550/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4551/// of [2 x i64] in \a mask determines the source for each element.
4552///
4553/// \code{.operation}
4554/// FOR element := 0 to 1
4555/// j := element*64
4556/// k := element*32
4557/// IF mask[j+63] == 0
4558/// result[j+63:j] := a[j+63:j]
4559/// ELSE
4560/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4561/// FI
4562/// ENDFOR
4563/// \endcode
4564///
4565/// \headerfile <immintrin.h>
4566///
4567/// \code
4568/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4569/// __m128i mask, const int s);
4570/// \endcode
4571///
4572/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4573///
4574/// \param a
4575/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4576/// zero.
4577/// \param m
4578/// A pointer to the memory used for loading values.
4579/// \param i
4580/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4581/// the first two elements are used.
4582/// \param mask
4583/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4584/// bit of each element in the mask vector is that element's mask bit. If a
4585/// mask bit is zero, the corresponding element of \a a is copied to the
4586/// result; otherwise the value is loaded from memory.
4587/// \param s
4588/// A literal constant scale factor for the indexes in \a i. Must be
4589/// 1, 2, 4, or 8.
4590/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4591#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
4592 ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
4593 (long long const *)(m), \
4594 (__v4si)(__m128i)(i), \
4595 (__v2di)(__m128i)(mask), (s)))
4596
4597/// Conditionally gathers four 64-bit integer values, either from the
4598/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4599/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4600/// of [4 x i64] in \a mask determines the source for each element.
4601///
4602/// \code{.operation}
4603/// FOR element := 0 to 3
4604/// j := element*64
4605/// k := element*32
4606/// IF mask[j+63] == 0
4607/// result[j+63:j] := a[j+63:j]
4608/// ELSE
4609/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4610/// FI
4611/// ENDFOR
4612/// \endcode
4613///
4614/// \headerfile <immintrin.h>
4615///
4616/// \code
4617/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4618/// __m128i i, __m256i mask, const int s);
4619/// \endcode
4620///
4621/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4622///
4623/// \param a
4624/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4625/// zero.
4626/// \param m
4627/// A pointer to the memory used for loading values.
4628/// \param i
4629/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4630/// \param mask
4631/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4632/// bit of each element in the mask vector is that element's mask bit. If a
4633/// mask bit is zero, the corresponding element of \a a is copied to the
4634/// result; otherwise the value is loaded from memory.
4635/// \param s
4636/// A literal constant scale factor for the indexes in \a i. Must be
4637/// 1, 2, 4, or 8.
4638/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4639#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
4640 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
4641 (long long const *)(m), \
4642 (__v4si)(__m128i)(i), \
4643 (__v4di)(__m256i)(mask), (s)))
4644
4645/// Conditionally gathers two 64-bit integer values, either from the
4646/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4647/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4648/// of [2 x i64] in \a mask determines the source for each element.
4649///
4650/// \code{.operation}
4651/// FOR element := 0 to 1
4652/// j := element*64
4653/// k := element*64
4654/// IF mask[j+63] == 0
4655/// result[j+63:j] := a[j+63:j]
4656/// ELSE
4657/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4658/// FI
4659/// ENDFOR
4660/// \endcode
4661///
4662/// \headerfile <immintrin.h>
4663///
4664/// \code
4665/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4666/// __m128i mask, const int s);
4667/// \endcode
4668///
4669/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4670///
4671/// \param a
4672/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4673/// zero.
4674/// \param m
4675/// A pointer to the memory used for loading values.
4676/// \param i
4677/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4678/// \param mask
4679/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4680/// bit of each element in the mask vector is that element's mask bit. If a
4681/// mask bit is zero, the corresponding element of \a a is copied to the
4682/// result; otherwise the value is loaded from memory.
4683/// \param s
4684/// A literal constant scale factor for the indexes in \a i. Must be
4685/// 1, 2, 4, or 8.
4686/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4687#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
4688 ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
4689 (long long const *)(m), \
4690 (__v2di)(__m128i)(i), \
4691 (__v2di)(__m128i)(mask), (s)))
4692
4693/// Conditionally gathers four 64-bit integer values, either from the
4694/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4695/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4696/// of [4 x i64] in \a mask determines the source for each element.
4697///
4698/// \code{.operation}
4699/// FOR element := 0 to 3
4700/// j := element*64
4701/// k := element*64
4702/// IF mask[j+63] == 0
4703/// result[j+63:j] := a[j+63:j]
4704/// ELSE
4705/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4706/// FI
4707/// ENDFOR
4708/// \endcode
4709///
4710/// \headerfile <immintrin.h>
4711///
4712/// \code
4713/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4714/// __m256i i, __m256i mask, const int s);
4715/// \endcode
4716///
4717/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4718///
4719/// \param a
4720/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4721/// zero.
4722/// \param m
4723/// A pointer to the memory used for loading values.
4724/// \param i
4725/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4726/// \param mask
4727/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4728/// bit of each element in the mask vector is that element's mask bit. If a
4729/// mask bit is zero, the corresponding element of \a a is copied to the
4730/// result; otherwise the value is loaded from memory.
4731/// \param s
4732/// A literal constant scale factor for the indexes in \a i. Must be
4733/// 1, 2, 4, or 8.
4734/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4735#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
4736 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
4737 (long long const *)(m), \
4738 (__v4di)(__m256i)(i), \
4739 (__v4di)(__m256i)(mask), (s)))
4740
4741/// Unconditionally gathers two 64-bit floating-point values from memory
4742/// \a m using scaled indexes from the 128-bit vector of [4 x i32] in \a i.
4743///
4744/// \code{.operation}
4745/// FOR element := 0 to 1
4746/// j := element*64
4747/// k := element*32
4748/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4749/// ENDFOR
4750/// \endcode
4751///
4752/// \headerfile <immintrin.h>
4753///
4754/// \code
4755/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4756/// \endcode
4757///
4758/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4759///
4760/// \param m
4761/// A pointer to the memory used for loading values.
4762/// \param i
4763/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4764/// the first two elements are used.
4765/// \param s
4766/// A literal constant scale factor for the indexes in \a i. Must be
4767/// 1, 2, 4, or 8.
4768/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4769#define _mm_i32gather_pd(m, i, s) \
4770 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
4771 (double const *)(m), \
4772 (__v4si)(__m128i)(i), \
4773 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4774 _mm_setzero_pd()), \
4775 (s)))
4776
4777/// Unconditionally gathers four 64-bit floating-point values from memory
4778/// \a m using scaled indexes from the 128-bit vector of [4 x i32] in \a i.
4779///
4780/// \code{.operation}
4781/// FOR element := 0 to 3
4782/// j := element*64
4783/// k := element*32
4784/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4785/// ENDFOR
4786/// \endcode
4787///
4788/// \headerfile <immintrin.h>
4789///
4790/// \code
4791/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4792/// \endcode
4793///
4794/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4795///
4796/// \param m
4797/// A pointer to the memory used for loading values.
4798/// \param i
4799/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4800/// \param s
4801/// A literal constant scale factor for the indexes in \a i. Must be
4802/// 1, 2, 4, or 8.
4803/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4804#define _mm256_i32gather_pd(m, i, s) \
4805 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
4806 (double const *)(m), \
4807 (__v4si)(__m128i)(i), \
4808 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4809 _mm256_setzero_pd(), \
4810 _CMP_EQ_OQ), \
4811 (s)))
4812
4813/// Unconditionally gathers two 64-bit floating-point values from memory
4814/// \a m using scaled indexes from the 128-bit vector of [2 x i64] in \a i.
4815///
4816/// \code{.operation}
4817/// FOR element := 0 to 1
4818/// j := element*64
4819/// k := element*64
4820/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4821/// ENDFOR
4822/// \endcode
4823///
4824/// \headerfile <immintrin.h>
4825///
4826/// \code
4827/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4828/// \endcode
4829///
4830/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4831///
4832/// \param m
4833/// A pointer to the memory used for loading values.
4834/// \param i
4835/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4836/// \param s
4837/// A literal constant scale factor for the indexes in \a i. Must be
4838/// 1, 2, 4, or 8.
4839/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4840#define _mm_i64gather_pd(m, i, s) \
4841 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
4842 (double const *)(m), \
4843 (__v2di)(__m128i)(i), \
4844 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4845 _mm_setzero_pd()), \
4846 (s)))
4847
4848/// Unconditionally gathers four 64-bit floating-point values from memory
4849/// \a m using scaled indexes from the 256-bit vector of [4 x i64] in \a i.
4850///
4851/// \code{.operation}
4852/// FOR element := 0 to 3
4853/// j := element*64
4854/// k := element*64
4855/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4856/// ENDFOR
4857/// \endcode
4858///
4859/// \headerfile <immintrin.h>
4860///
4861/// \code
4862/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4863/// \endcode
4864///
4865/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4866///
4867/// \param m
4868/// A pointer to the memory used for loading values.
4869/// \param i
4870/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4871/// \param s
4872/// A literal constant scale factor for the indexes in \a i. Must be
4873/// 1, 2, 4, or 8.
4874/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4875#define _mm256_i64gather_pd(m, i, s) \
4876 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4877 (double const *)(m), \
4878 (__v4di)(__m256i)(i), \
4879 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4880 _mm256_setzero_pd(), \
4881 _CMP_EQ_OQ), \
4882 (s)))
4883
4884/// Unconditionally gathers four 32-bit floating-point values from memory
4885/// \a m using scaled indexes from the 128-bit vector of [4 x i32] in \a i.
4886///
4887/// \code{.operation}
4888/// FOR element := 0 to 3
4889/// j := element*32
4890/// k := element*32
4891/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4892/// ENDFOR
4893/// \endcode
4894///
4895/// \headerfile <immintrin.h>
4896///
4897/// \code
4898/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4899/// \endcode
4900///
4901/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4902///
4903/// \param m
4904/// A pointer to the memory used for loading values.
4905/// \param i
4906/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4907/// \param s
4908/// A literal constant scale factor for the indexes in \a i. Must be
4909/// 1, 2, 4, or 8.
4910/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4911#define _mm_i32gather_ps(m, i, s) \
4912 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4913 (float const *)(m), \
4914 (__v4si)(__m128i)(i), \
4915 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4916 _mm_setzero_ps()), \
4917 (s)))
4918
4919/// Unconditionally gathers eight 32-bit floating-point values from memory
4920/// \a m using scaled indexes from the 256-bit vector of [8 x i32] in \a i.
4921///
4922/// \code{.operation}
4923/// FOR element := 0 to 7
4924/// j := element*32
4925/// k := element*32
4926/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4927/// ENDFOR
4928/// \endcode
4929///
4930/// \headerfile <immintrin.h>
4931///
4932/// \code
4933/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4934/// \endcode
4935///
4936/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4937///
4938/// \param m
4939/// A pointer to the memory used for loading values.
4940/// \param i
4941/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4942/// \param s
4943/// A literal constant scale factor for the indexes in \a i. Must be
4944/// 1, 2, 4, or 8.
4945/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4946#define _mm256_i32gather_ps(m, i, s) \
4947 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4948 (float const *)(m), \
4949 (__v8si)(__m256i)(i), \
4950 (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4951 _mm256_setzero_ps(), \
4952 _CMP_EQ_OQ), \
4953 (s)))
4954
4955/// Unconditionally gathers two 32-bit floating-point values from memory
4956/// \a m using scaled indexes from the 128-bit vector of [2 x i64] in
4957/// \a i. The upper two elements of the result are zeroed.
4958///
4959/// \code{.operation}
4960/// FOR element := 0 to 1
4961/// j := element*32
4962/// k := element*64
4963/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4964/// ENDFOR
4965/// result[127:64] := 0
4966/// \endcode
4967///
4968/// \headerfile <immintrin.h>
4969///
4970/// \code
4971/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4972/// \endcode
4973///
4974/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4975///
4976/// \param m
4977/// A pointer to the memory used for loading values.
4978/// \param i
4979/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4980/// \param s
4981/// A literal constant scale factor for the indexes in \a i. Must be
4982/// 1, 2, 4, or 8.
4983/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4984#define _mm_i64gather_ps(m, i, s) \
4985 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4986 (float const *)(m), \
4987 (__v2di)(__m128i)(i), \
4988 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4989 _mm_setzero_ps()), \
4990 (s)))
4991
4992/// Unconditionally gathers four 32-bit floating-point values from memory
4993/// \a m using scaled indexes from the 256-bit vector of [4 x i64] in \a i.
4994///
4995/// \code{.operation}
4996/// FOR element := 0 to 3
4997/// j := element*32
4998/// k := element*64
4999/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5000/// ENDFOR
5001/// \endcode
5002///
5003/// \headerfile <immintrin.h>
5004///
5005/// \code
5006/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
5007/// \endcode
5008///
5009/// This intrinsic corresponds to the \c VGATHERQPS instruction.
5010///
5011/// \param m
5012/// A pointer to the memory used for loading values.
5013/// \param i
5014/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5015/// \param s
5016/// A literal constant scale factor for the indexes in \a i. Must be
5017/// 1, 2, 4, or 8.
5018/// \returns A 128-bit vector of [4 x float] containing the gathered values.
5019#define _mm256_i64gather_ps(m, i, s) \
5020 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
5021 (float const *)(m), \
5022 (__v4di)(__m256i)(i), \
5023 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
5024 _mm_setzero_ps()), \
5025 (s)))
5026
5027/// Unconditionally gathers four 32-bit integer values from memory \a m
5028/// using scaled indexes from the 128-bit vector of [4 x i32] in \a i.
5029///
5030/// \code{.operation}
5031/// FOR element := 0 to 3
5032/// j := element*32
5033/// k := element*32
5034/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5035/// ENDFOR
5036/// \endcode
5037///
5038/// \headerfile <immintrin.h>
5039///
5040/// \code
5041/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5042/// \endcode
5043///
5044/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5045///
5046/// \param m
5047/// A pointer to the memory used for loading values.
5048/// \param i
5049/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5050/// \param s
5051/// A literal constant scale factor for the indexes in \a i. Must be
5052/// 1, 2, 4, or 8.
5053/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5054#define _mm_i32gather_epi32(m, i, s) \
5055 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
5056 (int const *)(m), (__v4si)(__m128i)(i), \
5057 (__v4si)_mm_set1_epi32(-1), (s)))
5058
5059/// Gathers eight 32-bit integer values from memory \a m using scaled
5060/// indexes from the 256-bit vector of [8 x i32] in \a i.
5061///
5062/// \code{.operation}
5063/// FOR element := 0 to 7
5064/// j := element*32
5065/// k := element*32
5066/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5067/// ENDFOR
5068/// \endcode
5069///
5070/// \headerfile <immintrin.h>
5071///
5072/// \code
5073/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5074/// \endcode
5075///
5076/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5077///
5078/// \param m
5079/// A pointer to the base memory location; each index in \a i, multiplied by \a s, is added to it to address one load.
5080/// \param i
5081/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5082/// \param s
5083/// A literal constant scale factor for the indexes in \a i. Must be
5084/// 1, 2, 4, or 8.
5085/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5086#define _mm256_i32gather_epi32(m, i, s) \
5087 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5088 (int const *)(m), (__v8si)(__m256i)(i), \
5089 (__v8si)_mm256_set1_epi32(-1), (s)))
5090
5091/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5092/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5093/// of the result are zeroed.
5094///
5095/// \code{.operation}
5096/// FOR element := 0 to 1
5097/// j := element*32
5098/// k := element*64
5099/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5100/// ENDFOR
5101/// result[127:64] := 0
5102/// \endcode
5103///
5104/// \headerfile <immintrin.h>
5105///
5106/// \code
5107/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5108/// \endcode
5109///
5110/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5111///
5112/// \param m
5113/// A pointer to the base memory location; each index in \a i, multiplied by \a s, is added to it to address one load.
5114/// \param i
5115/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5116/// \param s
5117/// A literal constant scale factor for the indexes in \a i. Must be
5118/// 1, 2, 4, or 8.
5119/// \returns A 128-bit vector of [4 x i32] containing the two gathered values in its lower two elements; the upper two elements are zero.
5120#define _mm_i64gather_epi32(m, i, s) \
5121 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5122 (int const *)(m), (__v2di)(__m128i)(i), \
5123 (__v4si)_mm_set1_epi32(-1), (s)))
5124
5125/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5126/// from the 256-bit vector of [4 x i64] in \a i. The four results are packed into a 128-bit vector.
5127///
5128/// \code{.operation}
5129/// FOR element := 0 to 3
5130/// j := element*32
5131/// k := element*64
5132/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5133/// ENDFOR
5134/// \endcode
5135///
5136/// \headerfile <immintrin.h>
5137///
5138/// \code
5139/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5140/// \endcode
5141///
5142/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5143///
5144/// \param m
5145/// A pointer to the base memory location; each index in \a i, multiplied by \a s, is added to it to address one load.
5146/// \param i
5147/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5148/// \param s
5149/// A literal constant scale factor for the indexes in \a i. Must be
5150/// 1, 2, 4, or 8.
5151/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5152#define _mm256_i64gather_epi32(m, i, s) \
5153 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5154 (int const *)(m), (__v4di)(__m256i)(i), \
5155 (__v4si)_mm_set1_epi32(-1), (s)))
5156
5157/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5158/// from the lower two elements of the 128-bit vector of [4 x i32] in \a i.
5159///
5160/// \code{.operation}
5161/// FOR element := 0 to 1
5162/// j := element*64
5163/// k := element*32
5164/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5165/// ENDFOR
5166/// \endcode
5167///
5168/// \headerfile <immintrin.h>
5169///
5170/// \code
5171/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5172/// \endcode
5173///
5174/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5175///
5176/// \param m
5177/// A pointer to the base memory location; each index in \a i, multiplied by \a s, is added to it to address one load.
5178/// \param i
5179/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5180/// the first two elements are used.
5181/// \param s
5182/// A literal constant scale factor for the indexes in \a i. Must be
5183/// 1, 2, 4, or 8.
5184/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5185#define _mm_i32gather_epi64(m, i, s) \
5186 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5187 (long long const *)(m), \
5188 (__v4si)(__m128i)(i), \
5189 (__v2di)_mm_set1_epi64x(-1), (s)))
5190
5191/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5192/// from the 128-bit vector of [4 x i32] in \a i.
5193///
5194/// \code{.operation}
5195/// FOR element := 0 to 3
5196/// j := element*64
5197/// k := element*32
5198/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5199/// ENDFOR
5200/// \endcode
5201///
5202/// \headerfile <immintrin.h>
5203///
5204/// \code
5205/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5206/// \endcode
5207///
5208/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5209///
5210/// \param m
5211/// A pointer to the base memory location; each index in \a i, multiplied by \a s, is added to it to address one load.
5212/// \param i
5213/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. All four elements are used.
5214/// \param s
5215/// A literal constant scale factor for the indexes in \a i. Must be
5216/// 1, 2, 4, or 8.
5217/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5218#define _mm256_i32gather_epi64(m, i, s) \
5219 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5220 (long long const *)(m), \
5221 (__v4si)(__m128i)(i), \
5222 (__v4di)_mm256_set1_epi64x(-1), (s)))
5223
5224/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5225/// from the 128-bit vector of [2 x i64] in \a i.
5226///
5227/// \code{.operation}
5228/// FOR element := 0 to 1
5229/// j := element*64
5230/// k := element*64
5231/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5232/// ENDFOR
5233/// \endcode
5234///
5235/// \headerfile <immintrin.h>
5236///
5237/// \code
5238/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5239/// \endcode
5240///
5241/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5242///
5243/// \param m
5244/// A pointer to the base memory location; each index in \a i, multiplied by \a s, is added to it to address one load.
5245/// \param i
5246/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5247/// \param s
5248/// A literal constant scale factor for the indexes in \a i. Must be
5249/// 1, 2, 4, or 8.
5250/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5251#define _mm_i64gather_epi64(m, i, s) \
5252 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5253 (long long const *)(m), \
5254 (__v2di)(__m128i)(i), \
5255 (__v2di)_mm_set1_epi64x(-1), (s)))
5256
5257/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5258/// from the 256-bit vector of [4 x i64] in \a i.
5259///
5260/// \code{.operation}
5261/// FOR element := 0 to 3
5262/// j := element*64
5263/// k := element*64
5264/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5265/// ENDFOR
5266/// \endcode
5267///
5268/// \headerfile <immintrin.h>
5269///
5270/// \code
5271/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5272/// \endcode
5273///
5274/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5275///
5276/// \param m
5277/// A pointer to the base memory location; each index in \a i, multiplied by \a s, is added to it to address one load.
5278/// \param i
5279/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5280/// \param s
5281/// A literal constant scale factor for the indexes in \a i. Must be
5282/// 1, 2, 4, or 8.
5283/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5284#define _mm256_i64gather_epi64(m, i, s) \
5285 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5286 (long long const *)(m), \
5287 (__v4di)(__m256i)(i), \
5288 (__v4di)_mm256_set1_epi64x(-1), (s)))
5289
5290#undef __DEFAULT_FN_ATTRS256 /* attribute helper defined at the top of this file for its own functions; undefined here so it does not remain defined past this header */
5291#undef __DEFAULT_FN_ATTRS128 /* likewise for the 128-bit-width helper */
5292
5293#endif /* __AVX2INTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
Definition: avx2intrin.h:2443
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits,...
Definition: avx2intrin.h:2462
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8(__m256i __a)
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each va...
Definition: avx2intrin.h:108
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
Definition: avx2intrin.h:2403
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi32(__m128i __V)
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
Definition: avx2intrin.h:1552
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and re...
Definition: avx2intrin.h:857
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memo...
Definition: avx2intrin.h:3678
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_and_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
Definition: avx2intrin.h:464
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
Definition: avx2intrin.h:1450
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(const void *__V)
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vec...
Definition: avx2intrin.h:2995
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
Definition: avx2intrin.h:1185
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given...
Definition: avx2intrin.h:2199
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
Definition: avx2intrin.h:2960
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result'...
Definition: avx2intrin.h:3268
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given...
Definition: avx2intrin.h:2239
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
Definition: avx2intrin.h:1299
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi32(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation,...
Definition: avx2intrin.h:205
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M)
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the ...
Definition: avx2intrin.h:3554
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi32(__m128i __V)
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
Definition: avx2intrin.h:1395
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
Definition: avx2intrin.h:1367
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to mem...
Definition: avx2intrin.h:3648
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns t...
Definition: avx2intrin.h:637
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
Definition: avx2intrin.h:2178
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256...
Definition: avx2intrin.h:1910
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and...
Definition: avx2intrin.h:1679
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M)
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the...
Definition: avx2intrin.h:3522
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
Definition: avx2intrin.h:2301
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
Definition: avx2intrin.h:3236
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi16(__m128i __V)
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
Definition: avx2intrin.h:1526
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned satur...
Definition: avx2intrin.h:2668
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
Definition: avx2intrin.h:1223
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower ...
Definition: avx2intrin.h:1782
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
Definition: avx2intrin.h:2537
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32...
Definition: avx2intrin.h:3312
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than an...
Definition: avx2intrin.h:741
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
Definition: avx2intrin.h:1109
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64(__m256i __a, __m256i __b)
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
Definition: avx2intrin.h:2589
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given...
Definition: avx2intrin.h:3804
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater...
Definition: avx2intrin.h:795
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed by...
Definition: avx2intrin.h:1058
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
Definition: avx2intrin.h:2728
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and r...
Definition: avx2intrin.h:960
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
Definition: avx2intrin.h:2930
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned ...
Definition: avx2intrin.h:418
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastsi128_si256(__m128i __X)
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result...
Definition: avx2intrin.h:3080
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
Definition: avx2intrin.h:1204
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit ...
Definition: avx2intrin.h:1706
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater...
Definition: avx2intrin.h:821
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits spec...
Definition: avx2intrin.h:2159
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memor...
Definition: avx2intrin.h:3738
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srav_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
Definition: avx2intrin.h:3849
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 b...
Definition: avx2intrin.h:287
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_broadcastsd_pd(__m128d __X)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
Definition: avx2intrin.h:3064
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the...
Definition: avx2intrin.h:306
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x f...
Definition: avx2intrin.h:3370
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_xor_si256(__m256i __a, __m256i __b)
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
Definition: avx2intrin.h:2978
static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8(__m256i __a)
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vecto...
Definition: avx2intrin.h:1341
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturat...
Definition: avx2intrin.h:2615
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
Definition: avx2intrin.h:3916
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits give...
Definition: avx2intrin.h:3960
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
Definition: avx2intrin.h:1166
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the ...
Definition: avx2intrin.h:569
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64(__m256i __a, __m256i __b)
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the ...
Definition: avx2intrin.h:344
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __...
Definition: avx2intrin.h:2079
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M)
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the ...
Definition: avx2intrin.h:3586
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given...
Definition: avx2intrin.h:3782
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
Definition: avx2intrin.h:1242
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadds_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using ...
Definition: avx2intrin.h:924
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
Definition: avx2intrin.h:1128
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
Definition: avx2intrin.h:2797
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
Definition: avx2intrin.h:1147
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given...
Definition: avx2intrin.h:3760
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and re...
Definition: avx2intrin.h:992
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
Definition: avx2intrin.h:3172
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
Definition: avx2intrin.h:3013
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
Definition: avx2intrin.h:2763
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits give...
Definition: avx2intrin.h:3938
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a, __m256i __b)
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a,...
Definition: avx2intrin.h:2037
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper...
Definition: avx2intrin.h:1744
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greate...
Definition: avx2intrin.h:769
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi32(__m128i __V)
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
Definition: avx2intrin.h:1603
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and r...
Definition: avx2intrin.h:715
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi16(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation,...
Definition: avx2intrin.h:173
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using...
Definition: avx2intrin.h:1028
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
Definition: avx2intrin.h:3252
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and ...
Definition: avx2intrin.h:663
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memo...
Definition: avx2intrin.h:3708
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
Definition: avx2intrin.h:2510
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upp...
Definition: avx2intrin.h:1725
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits,...
Definition: avx2intrin.h:2138
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
Definition: avx2intrin.h:2323
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srav_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
Definition: avx2intrin.h:3872
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
Definition: avx2intrin.h:1261
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi64(__m128i __V)
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
Definition: avx2intrin.h:1577
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a, __m256i __b)
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit interme...
Definition: avx2intrin.h:1090
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_broadcastsd_pd(__m128d __a)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
Definition: avx2intrin.h:3030
#define __DEFAULT_FN_ATTRS256
Definition: avx2intrin.h:26
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
Definition: avx2intrin.h:3894
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16...
Definition: avx2intrin.h:534
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
Definition: avx2intrin.h:1280
#define __DEFAULT_FN_ATTRS128
Definition: avx2intrin.h:29
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation,...
Definition: avx2intrin.h:268
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
Definition: avx2intrin.h:2861
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using sign...
Definition: avx2intrin.h:2641
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and r...
Definition: avx2intrin.h:689
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
Definition: avx2intrin.h:1318
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32(__m256i __a, __m256i __b)
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
Definition: avx2intrin.h:2563
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi64(__m128i __V)
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
Definition: avx2intrin.h:1628
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
Definition: avx2intrin.h:3188
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower...
Definition: avx2intrin.h:1763
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
Definition: avx2intrin.h:2259
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
Definition: avx2intrin.h:2896
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi32_epi64(__m128i __V)
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
Definition: avx2intrin.h:1500
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation...
Definition: avx2intrin.h:400
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
Definition: avx2intrin.h:2827
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi16(__m256i __a, __m256i __b)
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation,...
Definition: avx2intrin.h:236
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation,...
Definition: avx2intrin.h:363
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result...
Definition: avx2intrin.h:3284
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits give...
Definition: avx2intrin.h:2483
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
Definition: avx2intrin.h:2422
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_andnot_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit int...
Definition: avx2intrin.h:482
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits,...
Definition: avx2intrin.h:2218
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu32_epi64(__m128i __V)
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
Definition: avx2intrin.h:1653
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
Definition: avx2intrin.h:2281
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result'...
Definition: avx2intrin.h:3204
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi64(__m128i __V)
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
Definition: avx2intrin.h:1475
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M)
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the c...
Definition: avx2intrin.h:3618
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi32(__m256i __a)
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a a...
Definition: avx2intrin.h:142
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi64(__m128i __V)
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
Definition: avx2intrin.h:1422
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in _...
Definition: avx2intrin.h:2058
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and ret...
Definition: avx2intrin.h:889
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epu32(__m256i __a, __m256i __b)
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] an...
Definition: avx2intrin.h:1808
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given...
Definition: avx2intrin.h:3826
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b)
Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers fr...
Definition: avx2intrin.h:1871
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16(__m256i __a)
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a ...
Definition: avx2intrin.h:125
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed sa...
Definition: avx2intrin.h:381
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
Definition: avx2intrin.h:3047
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a a...
Definition: avx2intrin.h:508
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32(__m256i __a, __m256i __b)
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the ...
Definition: avx2intrin.h:325
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_or_si256(__m256i __a, __m256i __b)
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
Definition: avx2intrin.h:1826
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsi...
Definition: avx2intrin.h:2694
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result'...
Definition: avx2intrin.h:3220
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
Definition: avx2intrin.h:2382
static __inline__ void int __a
Definition: emmintrin.h:4079
__inline unsigned int unsigned int __Y
Definition: bmi2intrin.h:19