clang 22.0.0git
avx2intrin.h
Go to the documentation of this file.
1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX2INTRIN_H
15#define __AVX2INTRIN_H
16
/* Default attributes for the functions in this file: always inlined, no debug
   info, built for the "avx2" target, with a minimum vector width of 256 or
   128 bits respectively. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),          \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),          \
                 __min_vector_width__(128)))

/* The *_CONSTEXPR variants add `constexpr` when the header is compiled as
   C++11 or later; otherwise they are plain aliases of the macros above. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif
32
33/* SSE4 Multiple Packed Sums of Absolute Difference. */
34/// Computes sixteen sum-of-absolute-differences (SAD) operations on sets of
35/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
36/// \a Y.
37///
38/// Eight SAD results are computed using the lower half of the input
39/// vectors, and another eight using the upper half. These 16-bit values
40/// are returned in the lower and upper halves of the 256-bit result,
41/// respectively.
42///
43/// A single SAD operation selects four bytes from \a X and four bytes from
44/// \a Y as input. It computes the differences between each \a X byte and
45/// the corresponding \a Y byte, takes the absolute value of each
46/// difference, and sums these four values to form one 16-bit result. The
47/// intrinsic computes 16 of these results with different sets of input
48/// bytes.
49///
50/// For each set of eight results, the SAD operations use the same four
51/// bytes from \a Y; the starting bit position for these four bytes is
52/// \a M[1:0] times 32 (lower half) or \a M[4:3] times 32 (upper half). The
53/// eight operations use successive, overlapping sets of four bytes from
54/// \a X; the first set starts at \a M[2] times 32 (lower half) or \a M[5]
55/// times 32 (upper half). These bit positions are relative to the 128-bit lane.
56///
57/// \code{.operation}
58/// r := 0
59/// FOR i := 0 TO 1
60/// j := i*3
61/// Ybase := M[j+1:j]*32 + i*128
62/// Xbase := M[j+2]*32 + i*128
63/// FOR k := 0 TO 7
64/// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
65/// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
66/// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
67/// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
68/// result[r+15:r] := temp0 + temp1 + temp2 + temp3
69/// Xbase := Xbase + 8
70/// r := r + 16
71/// ENDFOR
72/// ENDFOR
73/// \endcode
74///
75/// \headerfile <immintrin.h>
76///
77/// \code
78/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
79/// \endcode
80///
81/// This intrinsic corresponds to the \c VMPSADBW instruction.
82///
83/// \param X
84/// A 256-bit integer vector containing one of the inputs.
85/// \param Y
86/// A 256-bit integer vector containing one of the inputs.
87/// \param M
88/// An unsigned immediate value specifying the starting positions of the
89/// bytes to operate on.
90/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: M is documented as an immediate, so it is cast to int and passed
   straight through as the builtin's last argument. */
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X),                   \
                                      (__v32qi)(__m256i)(Y), (int)(M)))
94
95/// Computes the absolute value of each signed byte in the 256-bit integer
96/// vector \a __a and returns each value in the corresponding byte of
97/// the result.
98///
99/// \headerfile <immintrin.h>
100///
101/// This intrinsic corresponds to the \c VPABSB instruction.
102///
103/// \param __a
104/// A 256-bit integer vector.
105/// \returns A 256-bit integer vector containing the result.
106static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
108 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
109}
110
111/// Computes the absolute value of each signed 16-bit element in the 256-bit
112/// vector of [16 x i16] in \a __a and returns each value in the
113/// corresponding element of the result.
114///
115/// \headerfile <immintrin.h>
116///
117/// This intrinsic corresponds to the \c VPABSW instruction.
118///
119/// \param __a
120/// A 256-bit vector of [16 x i16].
121/// \returns A 256-bit vector of [16 x i16] containing the result.
122static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
124 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
125}
126
127/// Computes the absolute value of each signed 32-bit element in the 256-bit
128/// vector of [8 x i32] in \a __a and returns each value in the
129/// corresponding element of the result.
130///
131/// \headerfile <immintrin.h>
132///
133/// This intrinsic corresponds to the \c VPABSD instruction.
134///
135/// \param __a
136/// A 256-bit vector of [8 x i32].
137/// \returns A 256-bit vector of [8 x i32] containing the result.
138static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
140 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
141}
142
143/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
144/// integers using signed saturation, and returns the 256-bit result.
145///
146/// \code{.operation}
147/// FOR i := 0 TO 7
148/// j := i*16
149/// k := i*8
150/// result[7+k:k] := SATURATE8(__a[15+j:j])
151/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
152/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
153/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
154/// ENDFOR
155/// \endcode
156///
157/// \headerfile <immintrin.h>
158///
159/// This intrinsic corresponds to the \c VPACKSSWB instruction.
160///
161/// \param __a
162/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
163/// result[191:128].
164/// \param __b
165/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
166/// result[255:192].
167/// \returns A 256-bit integer vector containing the result.
168static __inline__ __m256i __DEFAULT_FN_ATTRS256
169_mm256_packs_epi16(__m256i __a, __m256i __b)
170{
171 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
172}
173
174/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
175/// integers using signed saturation, and returns the resulting 256-bit
176/// vector of [16 x i16].
177///
178/// \code{.operation}
179/// FOR i := 0 TO 3
180/// j := i*32
181/// k := i*16
182/// result[15+k:k] := SATURATE16(__a[31+j:j])
183/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
184/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
185/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
186/// ENDFOR
187/// \endcode
188///
189/// \headerfile <immintrin.h>
190///
191/// This intrinsic corresponds to the \c VPACKSSDW instruction.
192///
193/// \param __a
194/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
195/// result[191:128].
196/// \param __b
197/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
198/// result[255:192].
199/// \returns A 256-bit vector of [16 x i16] containing the result.
200static __inline__ __m256i __DEFAULT_FN_ATTRS256
201_mm256_packs_epi32(__m256i __a, __m256i __b)
202{
203 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
204}
205
206/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
207/// using unsigned saturation, and returns the 256-bit result.
208///
209/// \code{.operation}
210/// FOR i := 0 TO 7
211/// j := i*16
212/// k := i*8
213/// result[7+k:k] := SATURATE8U(__a[15+j:j])
214/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
215/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
216/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
217/// ENDFOR
218/// \endcode
219///
220/// \headerfile <immintrin.h>
221///
222/// This intrinsic corresponds to the \c VPACKUSWB instruction.
223///
224/// \param __a
225/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
226/// result[191:128].
227/// \param __b
228/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
229/// result[255:192].
230/// \returns A 256-bit integer vector containing the result.
231static __inline__ __m256i __DEFAULT_FN_ATTRS256
232_mm256_packus_epi16(__m256i __a, __m256i __b)
233{
234 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
235}
236
237/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
238/// using unsigned saturation, and returns the resulting 256-bit vector of
239/// [16 x i16].
240///
241/// \code{.operation}
242/// FOR i := 0 TO 3
243/// j := i*32
244/// k := i*16
245/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
246/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
247/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
248/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
249/// ENDFOR
250/// \endcode
251///
252/// \headerfile <immintrin.h>
253///
254/// This intrinsic corresponds to the \c VPACKUSDW instruction.
255///
256/// \param __V1
257/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
258/// result[191:128].
259/// \param __V2
260/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
261/// result[255:192].
262/// \returns A 256-bit vector of [16 x i16] containing the result.
263static __inline__ __m256i __DEFAULT_FN_ATTRS256
264_mm256_packus_epi32(__m256i __V1, __m256i __V2)
265{
266 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
267}
268
269/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
270/// vectors and returns the lower 8 bits of each sum in the corresponding
271/// byte of the 256-bit integer vector result (overflow is ignored).
272///
273/// \headerfile <immintrin.h>
274///
275/// This intrinsic corresponds to the \c VPADDB instruction.
276///
277/// \param __a
278/// A 256-bit integer vector containing one of the source operands.
279/// \param __b
280/// A 256-bit integer vector containing one of the source operands.
281/// \returns A 256-bit integer vector containing the sums.
282static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
283_mm256_add_epi8(__m256i __a, __m256i __b) {
284 return (__m256i)((__v32qu)__a + (__v32qu)__b);
285}
286
287/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
288/// [16 x i16] and returns the lower 16 bits of each sum in the
289/// corresponding element of the [16 x i16] result (overflow is ignored).
290///
291/// \headerfile <immintrin.h>
292///
293/// This intrinsic corresponds to the \c VPADDW instruction.
294///
295/// \param __a
296/// A 256-bit vector of [16 x i16] containing one of the source operands.
297/// \param __b
298/// A 256-bit vector of [16 x i16] containing one of the source operands.
299/// \returns A 256-bit vector of [16 x i16] containing the sums.
300static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
301_mm256_add_epi16(__m256i __a, __m256i __b) {
302 return (__m256i)((__v16hu)__a + (__v16hu)__b);
303}
304
305/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
306/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
307/// element of the [8 x i32] result (overflow is ignored).
308///
309/// \headerfile <immintrin.h>
310///
311/// This intrinsic corresponds to the \c VPADDD instruction.
312///
313/// \param __a
314/// A 256-bit vector of [8 x i32] containing one of the source operands.
315/// \param __b
316/// A 256-bit vector of [8 x i32] containing one of the source operands.
317/// \returns A 256-bit vector of [8 x i32] containing the sums.
318static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
319_mm256_add_epi32(__m256i __a, __m256i __b) {
320 return (__m256i)((__v8su)__a + (__v8su)__b);
321}
322
323/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
324/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
325/// element of the [4 x i64] result (overflow is ignored).
326///
327/// \headerfile <immintrin.h>
328///
329/// This intrinsic corresponds to the \c VPADDQ instruction.
330///
331/// \param __a
332/// A 256-bit vector of [4 x i64] containing one of the source operands.
333/// \param __b
334/// A 256-bit vector of [4 x i64] containing one of the source operands.
335/// \returns A 256-bit vector of [4 x i64] containing the sums.
336static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
337_mm256_add_epi64(__m256i __a, __m256i __b) {
338 return (__m256i)((__v4du)__a + (__v4du)__b);
339}
340
341/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
342/// vectors using signed saturation, and returns each sum in the
343/// corresponding byte of the 256-bit integer vector result.
344///
345/// \headerfile <immintrin.h>
346///
347/// This intrinsic corresponds to the \c VPADDSB instruction.
348///
349/// \param __a
350/// A 256-bit integer vector containing one of the source operands.
351/// \param __b
352/// A 256-bit integer vector containing one of the source operands.
353/// \returns A 256-bit integer vector containing the sums.
354static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
355_mm256_adds_epi8(__m256i __a, __m256i __b) {
356 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
357}
358
359/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
360/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
361///
362/// \headerfile <immintrin.h>
363///
364/// This intrinsic corresponds to the \c VPADDSW instruction.
365///
366/// \param __a
367/// A 256-bit vector of [16 x i16] containing one of the source operands.
368/// \param __b
369/// A 256-bit vector of [16 x i16] containing one of the source operands.
370/// \returns A 256-bit vector of [16 x i16] containing the sums.
371static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
372_mm256_adds_epi16(__m256i __a, __m256i __b) {
373 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
374}
375
376/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
377/// vectors using unsigned saturation, and returns each sum in the
378/// corresponding byte of the 256-bit integer vector result.
379///
380/// \headerfile <immintrin.h>
381///
382/// This intrinsic corresponds to the \c VPADDUSB instruction.
383///
384/// \param __a
385/// A 256-bit integer vector containing one of the source operands.
386/// \param __b
387/// A 256-bit integer vector containing one of the source operands.
388/// \returns A 256-bit integer vector containing the sums.
389static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
390_mm256_adds_epu8(__m256i __a, __m256i __b) {
391 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
392}
393
394/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
395/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
396///
397/// \headerfile <immintrin.h>
398///
399/// This intrinsic corresponds to the \c VPADDUSW instruction.
400///
401/// \param __a
402/// A 256-bit vector of [16 x i16] containing one of the source operands.
403/// \param __b
404/// A 256-bit vector of [16 x i16] containing one of the source operands.
405/// \returns A 256-bit vector of [16 x i16] containing the sums.
406static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
407_mm256_adds_epu16(__m256i __a, __m256i __b) {
408 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
409}
410
411/// Uses the lower half of the 256-bit vector \a a as the upper half of a
412/// temporary 256-bit value, and the lower half of the 256-bit vector \a b
413/// as the lower half of the temporary value. Right-shifts the temporary
414/// value by \a n bytes, and uses the lower 16 bytes of the shifted value
415/// as the lower 16 bytes of the result. Uses the upper halves of \a a and
416/// \a b to make another temporary value, right shifts by \a n, and uses
417/// the lower 16 bytes of the shifted value as the upper 16 bytes of the
418/// result.
419///
420/// \headerfile <immintrin.h>
421///
422/// \code
423/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
424/// \endcode
425///
426/// This intrinsic corresponds to the \c VPALIGNR instruction.
427///
428/// \param a
429/// A 256-bit integer vector containing source values.
430/// \param b
431/// A 256-bit integer vector containing source values.
432/// \param n
433/// An immediate value specifying the number of bytes to shift.
434/// \returns A 256-bit integer vector containing the result.
/* Macro form: n is documented as an immediate. It is forwarded as written,
   without an int cast. */
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a),                   \
                                      (__v32qi)(__m256i)(b), (n)))
438
439/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
440/// \a __b.
441///
442/// \headerfile <immintrin.h>
443///
444/// This intrinsic corresponds to the \c VPAND instruction.
445///
446/// \param __a
447/// A 256-bit integer vector.
448/// \param __b
449/// A 256-bit integer vector.
450/// \returns A 256-bit integer vector containing the result.
451static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
452_mm256_and_si256(__m256i __a, __m256i __b)
453{
454 return (__m256i)((__v4du)__a & (__v4du)__b);
455}
456
457/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
458/// the bitwise NOT of the 256-bit integer vector in \a __a.
459///
460/// \headerfile <immintrin.h>
461///
462/// This intrinsic corresponds to the \c VPANDN instruction.
463///
464/// \param __a
465/// A 256-bit integer vector.
466/// \param __b
467/// A 256-bit integer vector.
468/// \returns A 256-bit integer vector containing the result.
469static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
470_mm256_andnot_si256(__m256i __a, __m256i __b)
471{
472 return (__m256i)(~(__v4du)__a & (__v4du)__b);
473}
474
475/// Computes the averages of the corresponding unsigned bytes in the two
476/// 256-bit integer vectors in \a __a and \a __b and returns each
477/// average in the corresponding byte of the 256-bit result.
478///
479/// \code{.operation}
480/// FOR i := 0 TO 31
481/// j := i*8
482/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
483/// ENDFOR
484/// \endcode
485///
486/// \headerfile <immintrin.h>
487///
488/// This intrinsic corresponds to the \c VPAVGB instruction.
489///
490/// \param __a
491/// A 256-bit integer vector.
492/// \param __b
493/// A 256-bit integer vector.
494/// \returns A 256-bit integer vector containing the result.
495static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
496_mm256_avg_epu8(__m256i __a, __m256i __b) {
497 return (__m256i)__builtin_ia32_pavgb256((__v32qu)__a, (__v32qu)__b);
498}
499
500/// Computes the averages of the corresponding unsigned 16-bit integers in
501/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
502/// each average in the corresponding element of the 256-bit result.
503///
504/// \code{.operation}
505/// FOR i := 0 TO 15
506/// j := i*16
507/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
508/// ENDFOR
509/// \endcode
510///
511/// \headerfile <immintrin.h>
512///
513/// This intrinsic corresponds to the \c VPAVGW instruction.
514///
515/// \param __a
516/// A 256-bit vector of [16 x i16].
517/// \param __b
518/// A 256-bit vector of [16 x i16].
519/// \returns A 256-bit vector of [16 x i16] containing the result.
520static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
521_mm256_avg_epu16(__m256i __a, __m256i __b) {
522 return (__m256i)__builtin_ia32_pavgw256((__v16hu)__a, (__v16hu)__b);
523}
524
525/// Merges 8-bit integer values from either of the two 256-bit vectors
526/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
527/// the resulting 256-bit integer vector.
528///
529/// \code{.operation}
530/// FOR i := 0 TO 31
531/// j := i*8
532/// IF __M[7+j] == 0
533/// result[7+j:j] := __V1[7+j:j]
534/// ELSE
535/// result[7+j:j] := __V2[7+j:j]
536/// FI
537/// ENDFOR
538/// \endcode
539///
540/// \headerfile <immintrin.h>
541///
542/// This intrinsic corresponds to the \c VPBLENDVB instruction.
543///
544/// \param __V1
545/// A 256-bit integer vector containing source values.
546/// \param __V2
547/// A 256-bit integer vector containing source values.
548/// \param __M
549/// A 256-bit integer vector, with bit [7] of each byte specifying the
550/// source for each corresponding byte of the result. When the mask bit
551/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
552/// \a __V2.
553/// \returns A 256-bit integer vector containing the result.
554static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
555_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
556 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
557 (__v32qi)__M);
558}
559
560/// Merges 16-bit integer values from either of the two 256-bit vectors
561/// \a V1 or \a V2, as specified by the immediate integer operand \a M,
562/// and returns the resulting 256-bit vector of [16 x i16].
563///
564/// \code{.operation}
565/// FOR i := 0 TO 7
566/// j := i*16
567/// IF M[i] == 0
568/// result[15+j:j] := V1[15+j:j]
569/// result[143+j:128+j] := V1[143+j:128+j]
570/// ELSE
571/// result[15+j:j] := V2[15+j:j]
572/// result[143+j:128+j] := V2[143+j:128+j]
573/// FI
574/// ENDFOR
575/// \endcode
576///
577/// \headerfile <immintrin.h>
578///
579/// \code
580/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
581/// \endcode
582///
583/// This intrinsic corresponds to the \c VPBLENDW instruction.
584///
585/// \param V1
586/// A 256-bit vector of [16 x i16] containing source values.
587/// \param V2
588/// A 256-bit vector of [16 x i16] containing source values.
589/// \param M
590/// An immediate 8-bit integer operand, with bits [7:0] specifying the
591/// source for each element of the result. The position of the mask bit
592/// corresponds to the index of a copied value. When a mask bit is 0, the
593/// element is copied from \a V1; otherwise, it is copied from \a V2.
594/// \a M[0] determines the source for elements 0 and 8, \a M[1] for
595/// elements 1 and 9, and so forth.
596/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: M is documented as an immediate, so it is cast to int and passed
   straight through as the builtin's last argument. */
#define _mm256_blend_epi16(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1),                  \
                                      (__v16hi)(__m256i)(V2), (int)(M)))
600
601/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
602/// \a __b for equality and returns the outcomes in the corresponding
603/// bytes of the 256-bit result.
604///
605/// \code{.operation}
606/// FOR i := 0 TO 31
607/// j := i*8
608/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
609/// ENDFOR
610/// \endcode
611///
612/// \headerfile <immintrin.h>
613///
614/// This intrinsic corresponds to the \c VPCMPEQB instruction.
615///
616/// \param __a
617/// A 256-bit integer vector containing one of the inputs.
618/// \param __b
619/// A 256-bit integer vector containing one of the inputs.
620/// \returns A 256-bit integer vector containing the result.
621static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
622_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
623{
624 return (__m256i)((__v32qi)__a == (__v32qi)__b);
625}
626
627/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
628/// \a __a and \a __b for equality and returns the outcomes in the
629/// corresponding elements of the 256-bit result.
630///
631/// \code{.operation}
632/// FOR i := 0 TO 15
633/// j := i*16
634/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
635/// ENDFOR
636/// \endcode
637///
638/// \headerfile <immintrin.h>
639///
640/// This intrinsic corresponds to the \c VPCMPEQW instruction.
641///
642/// \param __a
643/// A 256-bit vector of [16 x i16] containing one of the inputs.
644/// \param __b
645/// A 256-bit vector of [16 x i16] containing one of the inputs.
646/// \returns A 256-bit vector of [16 x i16] containing the result.
647static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
648_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
649{
650 return (__m256i)((__v16hi)__a == (__v16hi)__b);
651}
652
653/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
654/// \a __a and \a __b for equality and returns the outcomes in the
655/// corresponding elements of the 256-bit result.
656///
657/// \code{.operation}
658/// FOR i := 0 TO 7
659/// j := i*32
660/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
661/// ENDFOR
662/// \endcode
663///
664/// \headerfile <immintrin.h>
665///
666/// This intrinsic corresponds to the \c VPCMPEQD instruction.
667///
668/// \param __a
669/// A 256-bit vector of [8 x i32] containing one of the inputs.
670/// \param __b
671/// A 256-bit vector of [8 x i32] containing one of the inputs.
672/// \returns A 256-bit vector of [8 x i32] containing the result.
673static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
674_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
675{
676 return (__m256i)((__v8si)__a == (__v8si)__b);
677}
678
679/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
680/// \a __a and \a __b for equality and returns the outcomes in the
681/// corresponding elements of the 256-bit result.
682///
683/// \code{.operation}
684/// FOR i := 0 TO 3
685/// j := i*64
686/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
687/// ENDFOR
688/// \endcode
689///
690/// \headerfile <immintrin.h>
691///
692/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
693///
694/// \param __a
695/// A 256-bit vector of [4 x i64] containing one of the inputs.
696/// \param __b
697/// A 256-bit vector of [4 x i64] containing one of the inputs.
698/// \returns A 256-bit vector of [4 x i64] containing the result.
699static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
700_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
701{
702 return (__m256i)((__v4di)__a == (__v4di)__b);
703}
704
705/// Compares corresponding signed bytes in the 256-bit integer vectors in
706/// \a __a and \a __b for greater-than and returns the outcomes in the
707/// corresponding bytes of the 256-bit result.
708///
709/// \code{.operation}
710/// FOR i := 0 TO 31
711/// j := i*8
712/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
713/// ENDFOR
714/// \endcode
715///
716/// \headerfile <immintrin.h>
717///
718/// This intrinsic corresponds to the \c VPCMPGTB instruction.
719///
720/// \param __a
721/// A 256-bit integer vector containing one of the inputs.
722/// \param __b
723/// A 256-bit integer vector containing one of the inputs.
724/// \returns A 256-bit integer vector containing the result.
725static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
726_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
727{
728 /* This function always performs a signed comparison, but __v32qi is a char
729 which may be signed or unsigned, so use __v32qs. */
730 return (__m256i)((__v32qs)__a > (__v32qs)__b);
731}
732
733/// Compares corresponding signed elements in the 256-bit vectors of
734/// [16 x i16] in \a __a and \a __b for greater-than and returns the
735/// outcomes in the corresponding elements of the 256-bit result.
736///
737/// \code{.operation}
738/// FOR i := 0 TO 15
739/// j := i*16
740/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
741/// ENDFOR
742/// \endcode
743///
744/// \headerfile <immintrin.h>
745///
746/// This intrinsic corresponds to the \c VPCMPGTW instruction.
747///
748/// \param __a
749/// A 256-bit vector of [16 x i16] containing one of the inputs.
750/// \param __b
751/// A 256-bit vector of [16 x i16] containing one of the inputs.
752/// \returns A 256-bit vector of [16 x i16] containing the result.
753static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
754_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
755{
756 return (__m256i)((__v16hi)__a > (__v16hi)__b);
757}
758
759/// Compares corresponding signed elements in the 256-bit vectors of
760/// [8 x i32] in \a __a and \a __b for greater-than and returns the
761/// outcomes in the corresponding elements of the 256-bit result.
762///
763/// \code{.operation}
764/// FOR i := 0 TO 7
765/// j := i*32
766/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
767/// ENDFOR
768/// \endcode
769///
770/// \headerfile <immintrin.h>
771///
772/// This intrinsic corresponds to the \c VPCMPGTD instruction.
773///
774/// \param __a
775/// A 256-bit vector of [8 x i32] containing one of the inputs.
776/// \param __b
777/// A 256-bit vector of [8 x i32] containing one of the inputs.
778/// \returns A 256-bit vector of [8 x i32] containing the result.
779static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
780_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
781{
782 return (__m256i)((__v8si)__a > (__v8si)__b);
783}
784
785/// Compares corresponding signed elements in the 256-bit vectors of
786/// [4 x i64] in \a __a and \a __b for greater-than and returns the
787/// outcomes in the corresponding elements of the 256-bit result.
788///
789/// \code{.operation}
790/// FOR i := 0 TO 3
791/// j := i*64
792/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
793/// ENDFOR
794/// \endcode
795///
796/// \headerfile <immintrin.h>
797///
798/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
799///
800/// \param __a
801/// A 256-bit vector of [4 x i64] containing one of the inputs.
802/// \param __b
803/// A 256-bit vector of [4 x i64] containing one of the inputs.
804/// \returns A 256-bit vector of [4 x i64] containing the result.
805static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
806_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
807{
808 return (__m256i)((__v4di)__a > (__v4di)__b);
809}
810
811/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
812/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
813/// element of the [16 x i16] result (overflow is ignored). Sums from
814/// \a __a are returned in the lower 64 bits of each 128-bit half of the
815/// result; sums from \a __b are returned in the upper 64 bits of each
816/// 128-bit half of the result.
817///
818/// \code{.operation}
819/// FOR i := 0 TO 1
820/// j := i*128
821/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
822/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
823/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
824/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
825/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
826/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
827/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
828/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
829/// ENDFOR
830/// \endcode
831///
832/// \headerfile <immintrin.h>
833///
834/// This intrinsic corresponds to the \c VPHADDW instruction.
835///
836/// \param __a
837/// A 256-bit vector of [16 x i16] containing one of the source operands.
838/// \param __b
839/// A 256-bit vector of [16 x i16] containing one of the source operands.
840/// \returns A 256-bit vector of [16 x i16] containing the sums.
841static __inline__ __m256i __DEFAULT_FN_ATTRS256
842_mm256_hadd_epi16(__m256i __a, __m256i __b)
843{
844 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
845}
846
847/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
848/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
849/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
850/// are returned in the lower 64 bits of each 128-bit half of the result;
851/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
852/// of the result.
853///
854/// \code{.operation}
855/// FOR i := 0 TO 1
856/// j := i*128
857/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
858/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
859/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
860/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
861/// ENDFOR
862/// \endcode
863///
864/// \headerfile <immintrin.h>
865///
866/// This intrinsic corresponds to the \c VPHADDD instruction.
867///
868/// \param __a
869/// A 256-bit vector of [8 x i32] containing one of the source operands.
870/// \param __b
871/// A 256-bit vector of [8 x i32] containing one of the source operands.
872/// \returns A 256-bit vector of [8 x i32] containing the sums.
873static __inline__ __m256i __DEFAULT_FN_ATTRS256
874_mm256_hadd_epi32(__m256i __a, __m256i __b)
875{
876 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
877}
878
879/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
880/// vectors of [16 x i16] using signed saturation and returns each sum in
881/// an element of the [16 x i16] result. Sums from \a __a are returned in
882/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
883/// are returned in the upper 64 bits of each 128-bit half of the result.
884///
885/// \code{.operation}
886/// FOR i := 0 TO 1
887/// j := i*128
888/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
889/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
890/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
891/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
892/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
893/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
894/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
895/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
896/// ENDFOR
897/// \endcode
898///
899/// \headerfile <immintrin.h>
900///
901/// This intrinsic corresponds to the \c VPHADDSW instruction.
902///
903/// \param __a
904/// A 256-bit vector of [16 x i16] containing one of the source operands.
905/// \param __b
906/// A 256-bit vector of [16 x i16] containing one of the source operands.
907/// \returns A 256-bit vector of [16 x i16] containing the sums.
908static __inline__ __m256i __DEFAULT_FN_ATTRS256
909_mm256_hadds_epi16(__m256i __a, __m256i __b)
910{
911 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
912}
913
914/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
915/// vectors of [16 x i16] and returns the lower 16 bits of each difference
916/// in an element of the [16 x i16] result (overflow is ignored).
917/// Differences from \a __a are returned in the lower 64 bits of each
918/// 128-bit half of the result; differences from \a __b are returned in the
919/// upper 64 bits of each 128-bit half of the result.
920///
921/// \code{.operation}
922/// FOR i := 0 TO 1
923/// j := i*128
924/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
925/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
926/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
927/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
928/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
929/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
930/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
931/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
932/// ENDFOR
933/// \endcode
934///
935/// \headerfile <immintrin.h>
936///
937/// This intrinsic corresponds to the \c VPHSUBW instruction.
938///
939/// \param __a
940/// A 256-bit vector of [16 x i16] containing one of the source operands.
941/// \param __b
942/// A 256-bit vector of [16 x i16] containing one of the source operands.
943/// \returns A 256-bit vector of [16 x i16] containing the differences.
944static __inline__ __m256i __DEFAULT_FN_ATTRS256
945_mm256_hsub_epi16(__m256i __a, __m256i __b)
946{
947 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
948}
949
950/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
951/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
952/// an element of the [8 x i32] result (overflow is ignored). Differences
953/// from \a __a are returned in the lower 64 bits of each 128-bit half of
954/// the result; differences from \a __b are returned in the upper 64 bits
955/// of each 128-bit half of the result.
956///
957/// \code{.operation}
958/// FOR i := 0 TO 1
959/// j := i*128
960/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
961/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
962/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
963/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
964/// ENDFOR
965/// \endcode
966///
967/// \headerfile <immintrin.h>
968///
969/// This intrinsic corresponds to the \c VPHSUBD instruction.
970///
971/// \param __a
972/// A 256-bit vector of [8 x i32] containing one of the source operands.
973/// \param __b
974/// A 256-bit vector of [8 x i32] containing one of the source operands.
975/// \returns A 256-bit vector of [8 x i32] containing the differences.
976static __inline__ __m256i __DEFAULT_FN_ATTRS256
977_mm256_hsub_epi32(__m256i __a, __m256i __b)
978{
979 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
980}
981
982/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
/// vectors of [16 x i16] using signed saturation and returns each difference in
984/// an element of the [16 x i16] result. Differences from \a __a are
985/// returned in the lower 64 bits of each 128-bit half of the result;
986/// differences from \a __b are returned in the upper 64 bits of each
987/// 128-bit half of the result.
988///
989/// \code{.operation}
990/// FOR i := 0 TO 1
991/// j := i*128
992/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
993/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
994/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
995/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
996/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
997/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
998/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
999/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1000/// ENDFOR
1001/// \endcode
1002///
1003/// \headerfile <immintrin.h>
1004///
1005/// This intrinsic corresponds to the \c VPHSUBSW instruction.
1006///
1007/// \param __a
1008/// A 256-bit vector of [16 x i16] containing one of the source operands.
1009/// \param __b
1010/// A 256-bit vector of [16 x i16] containing one of the source operands.
1011/// \returns A 256-bit vector of [16 x i16] containing the differences.
1012static __inline__ __m256i __DEFAULT_FN_ATTRS256
1013_mm256_hsubs_epi16(__m256i __a, __m256i __b)
1014{
1015 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1016}
1017
1018/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1019/// with the corresponding signed byte from the 256-bit integer vector in
1020/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1021/// pairs of those products using signed saturation to form 16-bit sums
1022/// returned as elements of the [16 x i16] result.
1023///
1024/// \code{.operation}
1025/// FOR i := 0 TO 15
1026/// j := i*16
1027/// temp1 := __a[j+7:j] * __b[j+7:j]
1028/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1029/// result[j+15:j] := SATURATE16(temp1 + temp2)
1030/// ENDFOR
1031/// \endcode
1032///
1033/// \headerfile <immintrin.h>
1034///
1035/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1036///
1037/// \param __a
1038/// A 256-bit vector containing one of the source operands.
1039/// \param __b
1040/// A 256-bit vector containing one of the source operands.
1041/// \returns A 256-bit vector of [16 x i16] containing the result.
1042static __inline__ __m256i __DEFAULT_FN_ATTRS256
1044{
1045 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1046}
1047
1048/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1049/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1050/// those products to form 32-bit sums returned as elements of the
1051/// [8 x i32] result.
1052///
1053/// There is only one wraparound case: when all four of the 16-bit sources
1054/// are \c 0x8000, the result will be \c 0x80000000.
1055///
1056/// \code{.operation}
1057/// FOR i := 0 TO 7
1058/// j := i*32
1059/// temp1 := __a[j+15:j] * __b[j+15:j]
1060/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1061/// result[j+31:j] := temp1 + temp2
1062/// ENDFOR
1063/// \endcode
1064///
1065/// \headerfile <immintrin.h>
1066///
1067/// This intrinsic corresponds to the \c VPMADDWD instruction.
1068///
1069/// \param __a
1070/// A 256-bit vector of [16 x i16] containing one of the source operands.
1071/// \param __b
1072/// A 256-bit vector of [16 x i16] containing one of the source operands.
1073/// \returns A 256-bit vector of [8 x i32] containing the result.
1074static __inline__ __m256i __DEFAULT_FN_ATTRS256
1075_mm256_madd_epi16(__m256i __a, __m256i __b)
1076{
1077 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1078}
1079
1080/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1081/// in \a __a and \a __b and returns the larger of each pair in the
1082/// corresponding byte of the 256-bit result.
1083///
1084/// \headerfile <immintrin.h>
1085///
1086/// This intrinsic corresponds to the \c VPMAXSB instruction.
1087///
1088/// \param __a
1089/// A 256-bit integer vector.
1090/// \param __b
1091/// A 256-bit integer vector.
1092/// \returns A 256-bit integer vector containing the result.
1093static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1094_mm256_max_epi8(__m256i __a, __m256i __b) {
1095 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1096}
1097
1098/// Compares the corresponding signed 16-bit integers in the two 256-bit
1099/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1100/// each pair in the corresponding element of the 256-bit result.
1101///
1102/// \headerfile <immintrin.h>
1103///
1104/// This intrinsic corresponds to the \c VPMAXSW instruction.
1105///
1106/// \param __a
1107/// A 256-bit vector of [16 x i16].
1108/// \param __b
1109/// A 256-bit vector of [16 x i16].
1110/// \returns A 256-bit vector of [16 x i16] containing the result.
1111static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1112_mm256_max_epi16(__m256i __a, __m256i __b) {
1113 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1114}
1115
1116/// Compares the corresponding signed 32-bit integers in the two 256-bit
1117/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1118/// each pair in the corresponding element of the 256-bit result.
1119///
1120/// \headerfile <immintrin.h>
1121///
1122/// This intrinsic corresponds to the \c VPMAXSD instruction.
1123///
1124/// \param __a
1125/// A 256-bit vector of [8 x i32].
1126/// \param __b
1127/// A 256-bit vector of [8 x i32].
1128/// \returns A 256-bit vector of [8 x i32] containing the result.
1129static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1130_mm256_max_epi32(__m256i __a, __m256i __b) {
1131 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1132}
1133
1134/// Compares the corresponding unsigned bytes in the two 256-bit integer
1135/// vectors in \a __a and \a __b and returns the larger of each pair in
1136/// the corresponding byte of the 256-bit result.
1137///
1138/// \headerfile <immintrin.h>
1139///
1140/// This intrinsic corresponds to the \c VPMAXUB instruction.
1141///
1142/// \param __a
1143/// A 256-bit integer vector.
1144/// \param __b
1145/// A 256-bit integer vector.
1146/// \returns A 256-bit integer vector containing the result.
1147static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1148_mm256_max_epu8(__m256i __a, __m256i __b) {
1149 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1150}
1151
1152/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1153/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1154/// each pair in the corresponding element of the 256-bit result.
1155///
1156/// \headerfile <immintrin.h>
1157///
1158/// This intrinsic corresponds to the \c VPMAXUW instruction.
1159///
1160/// \param __a
1161/// A 256-bit vector of [16 x i16].
1162/// \param __b
1163/// A 256-bit vector of [16 x i16].
1164/// \returns A 256-bit vector of [16 x i16] containing the result.
1165static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1166_mm256_max_epu16(__m256i __a, __m256i __b) {
1167 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1168}
1169
1170/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1171/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1172/// each pair in the corresponding element of the 256-bit result.
1173///
1174/// \headerfile <immintrin.h>
1175///
1176/// This intrinsic corresponds to the \c VPMAXUD instruction.
1177///
1178/// \param __a
1179/// A 256-bit vector of [8 x i32].
1180/// \param __b
1181/// A 256-bit vector of [8 x i32].
1182/// \returns A 256-bit vector of [8 x i32] containing the result.
1183static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1184_mm256_max_epu32(__m256i __a, __m256i __b) {
1185 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1186}
1187
1188/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1189/// in \a __a and \a __b and returns the smaller of each pair in the
1190/// corresponding byte of the 256-bit result.
1191///
1192/// \headerfile <immintrin.h>
1193///
1194/// This intrinsic corresponds to the \c VPMINSB instruction.
1195///
1196/// \param __a
1197/// A 256-bit integer vector.
1198/// \param __b
1199/// A 256-bit integer vector.
1200/// \returns A 256-bit integer vector containing the result.
1201static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1202_mm256_min_epi8(__m256i __a, __m256i __b) {
1203 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1204}
1205
1206/// Compares the corresponding signed 16-bit integers in the two 256-bit
1207/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1208/// each pair in the corresponding element of the 256-bit result.
1209///
1210/// \headerfile <immintrin.h>
1211///
1212/// This intrinsic corresponds to the \c VPMINSW instruction.
1213///
1214/// \param __a
1215/// A 256-bit vector of [16 x i16].
1216/// \param __b
1217/// A 256-bit vector of [16 x i16].
1218/// \returns A 256-bit vector of [16 x i16] containing the result.
1219static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1220_mm256_min_epi16(__m256i __a, __m256i __b) {
1221 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1222}
1223
1224/// Compares the corresponding signed 32-bit integers in the two 256-bit
1225/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1226/// each pair in the corresponding element of the 256-bit result.
1227///
1228/// \headerfile <immintrin.h>
1229///
1230/// This intrinsic corresponds to the \c VPMINSD instruction.
1231///
1232/// \param __a
1233/// A 256-bit vector of [8 x i32].
1234/// \param __b
1235/// A 256-bit vector of [8 x i32].
1236/// \returns A 256-bit vector of [8 x i32] containing the result.
1237static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1238_mm256_min_epi32(__m256i __a, __m256i __b) {
1239 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1240}
1241
1242/// Compares the corresponding unsigned bytes in the two 256-bit integer
1243/// vectors in \a __a and \a __b and returns the smaller of each pair in
1244/// the corresponding byte of the 256-bit result.
1245///
1246/// \headerfile <immintrin.h>
1247///
1248/// This intrinsic corresponds to the \c VPMINUB instruction.
1249///
1250/// \param __a
1251/// A 256-bit integer vector.
1252/// \param __b
1253/// A 256-bit integer vector.
1254/// \returns A 256-bit integer vector containing the result.
1255static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1256_mm256_min_epu8(__m256i __a, __m256i __b) {
1257 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1258}
1259
1260/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1261/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1262/// each pair in the corresponding element of the 256-bit result.
1263///
1264/// \headerfile <immintrin.h>
1265///
1266/// This intrinsic corresponds to the \c VPMINUW instruction.
1267///
1268/// \param __a
1269/// A 256-bit vector of [16 x i16].
1270/// \param __b
1271/// A 256-bit vector of [16 x i16].
1272/// \returns A 256-bit vector of [16 x i16] containing the result.
1273static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1274_mm256_min_epu16(__m256i __a, __m256i __b) {
1275 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1276}
1277
1278/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1279/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1280/// each pair in the corresponding element of the 256-bit result.
1281///
1282/// \headerfile <immintrin.h>
1283///
1284/// This intrinsic corresponds to the \c VPMINUD instruction.
1285///
1286/// \param __a
1287/// A 256-bit vector of [8 x i32].
1288/// \param __b
1289/// A 256-bit vector of [8 x i32].
1290/// \returns A 256-bit vector of [8 x i32] containing the result.
1291static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1292_mm256_min_epu32(__m256i __a, __m256i __b) {
1293 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1294}
1295
1296/// Creates a 32-bit integer mask from the most significant bit of each byte
1297/// in the 256-bit integer vector in \a __a and returns the result.
1298///
1299/// \code{.operation}
1300/// FOR i := 0 TO 31
1301/// j := i*8
1302/// result[i] := __a[j+7]
1303/// ENDFOR
1304/// \endcode
1305///
1306/// \headerfile <immintrin.h>
1307///
1308/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1309///
1310/// \param __a
1311/// A 256-bit integer vector containing the source bytes.
1312/// \returns The 32-bit integer mask.
1313static __inline__ int __DEFAULT_FN_ATTRS256
1315{
1316 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1317}
1318
1319/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1320/// the 16-bit values in the corresponding elements of a 256-bit vector
1321/// of [16 x i16].
1322///
1323/// \code{.operation}
1324/// FOR i := 0 TO 15
1325/// j := i*8
1326/// k := i*16
1327/// result[k+15:k] := SignExtend(__V[j+7:j])
1328/// ENDFOR
1329/// \endcode
1330///
1331/// \headerfile <immintrin.h>
1332///
1333/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1334///
1335/// \param __V
1336/// A 128-bit integer vector containing the source bytes.
1337/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1338/// values.
1339static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1341 /* This function always performs a signed extension, but __v16qi is a char
1342 which may be signed or unsigned, so use __v16qs. */
1343 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1344}
1345
1346/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1347/// \a __V and returns the 32-bit values in the corresponding elements of a
1348/// 256-bit vector of [8 x i32].
1349///
1350/// \code{.operation}
1351/// FOR i := 0 TO 7
1352/// j := i*8
1353/// k := i*32
1354/// result[k+31:k] := SignExtend(__V[j+7:j])
1355/// ENDFOR
1356/// \endcode
1357///
1358/// \headerfile <immintrin.h>
1359///
1360/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1361///
1362/// \param __V
1363/// A 128-bit integer vector containing the source bytes.
1364/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1365/// values.
1366static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1368 /* This function always performs a signed extension, but __v16qi is a char
1369 which may be signed or unsigned, so use __v16qs. */
1370 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1371}
1372
1373/// Sign-extends the first four bytes from the 128-bit integer vector in
1374/// \a __V and returns the 64-bit values in the corresponding elements of a
1375/// 256-bit vector of [4 x i64].
1376///
1377/// \code{.operation}
1378/// result[63:0] := SignExtend(__V[7:0])
1379/// result[127:64] := SignExtend(__V[15:8])
1380/// result[191:128] := SignExtend(__V[23:16])
1381/// result[255:192] := SignExtend(__V[31:24])
1382/// \endcode
1383///
1384/// \headerfile <immintrin.h>
1385///
1386/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1387///
1388/// \param __V
1389/// A 128-bit integer vector containing the source bytes.
1390/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1391/// values.
1392static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1394 /* This function always performs a signed extension, but __v16qi is a char
1395 which may be signed or unsigned, so use __v16qs. */
1396 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1397}
1398
1399/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1400/// \a __V and returns the 32-bit values in the corresponding elements of a
1401/// 256-bit vector of [8 x i32].
1402///
1403/// \code{.operation}
1404/// FOR i := 0 TO 7
1405/// j := i*16
1406/// k := i*32
1407/// result[k+31:k] := SignExtend(__V[j+15:j])
1408/// ENDFOR
1409/// \endcode
1410///
1411/// \headerfile <immintrin.h>
1412///
1413/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1414///
1415/// \param __V
1416/// A 128-bit vector of [8 x i16] containing the source values.
1417/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1418/// values.
1419static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1421 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1422}
1423
1424/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1425/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1426/// elements of a 256-bit vector of [4 x i64].
1427///
1428/// \code{.operation}
1429/// result[63:0] := SignExtend(__V[15:0])
1430/// result[127:64] := SignExtend(__V[31:16])
1431/// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
1433/// \endcode
1434///
1435/// \headerfile <immintrin.h>
1436///
1437/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1438///
1439/// \param __V
1440/// A 128-bit vector of [8 x i16] containing the source values.
1441/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1442/// values.
1443static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1445 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1446}
1447
1448/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1449/// \a __V and returns the 64-bit values in the corresponding elements of a
1450/// 256-bit vector of [4 x i64].
1451///
1452/// \code{.operation}
1453/// result[63:0] := SignExtend(__V[31:0])
1454/// result[127:64] := SignExtend(__V[63:32])
1455/// result[191:128] := SignExtend(__V[95:64])
1456/// result[255:192] := SignExtend(__V[127:96])
1457/// \endcode
1458///
1459/// \headerfile <immintrin.h>
1460///
1461/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1462///
1463/// \param __V
1464/// A 128-bit vector of [4 x i32] containing the source values.
1465/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1466/// values.
1467static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1469 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1470}
1471
1472/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1473/// the 16-bit values in the corresponding elements of a 256-bit vector
1474/// of [16 x i16].
1475///
1476/// \code{.operation}
1477/// FOR i := 0 TO 15
1478/// j := i*8
1479/// k := i*16
1480/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1481/// ENDFOR
1482/// \endcode
1483///
1484/// \headerfile <immintrin.h>
1485///
1486/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1487///
1488/// \param __V
1489/// A 128-bit integer vector containing the source bytes.
1490/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1491/// values.
1492static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1494 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1495}
1496
1497/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1498/// \a __V and returns the 32-bit values in the corresponding elements of a
1499/// 256-bit vector of [8 x i32].
1500///
1501/// \code{.operation}
1502/// FOR i := 0 TO 7
1503/// j := i*8
1504/// k := i*32
1505/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1506/// ENDFOR
1507/// \endcode
1508///
1509/// \headerfile <immintrin.h>
1510///
1511/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1512///
1513/// \param __V
1514/// A 128-bit integer vector containing the source bytes.
1515/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1516/// values.
1517static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1519 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1520}
1521
1522/// Zero-extends the first four bytes from the 128-bit integer vector in
1523/// \a __V and returns the 64-bit values in the corresponding elements of a
1524/// 256-bit vector of [4 x i64].
1525///
1526/// \code{.operation}
1527/// result[63:0] := ZeroExtend(__V[7:0])
1528/// result[127:64] := ZeroExtend(__V[15:8])
1529/// result[191:128] := ZeroExtend(__V[23:16])
1530/// result[255:192] := ZeroExtend(__V[31:24])
1531/// \endcode
1532///
1533/// \headerfile <immintrin.h>
1534///
1535/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1536///
1537/// \param __V
1538/// A 128-bit integer vector containing the source bytes.
1539/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1540/// values.
1541static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1543 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1544}
1545
1546/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1547/// \a __V and returns the 32-bit values in the corresponding elements of a
1548/// 256-bit vector of [8 x i32].
1549///
1550/// \code{.operation}
1551/// FOR i := 0 TO 7
1552/// j := i*16
1553/// k := i*32
1554/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1555/// ENDFOR
1556/// \endcode
1557///
1558/// \headerfile <immintrin.h>
1559///
1560/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1561///
1562/// \param __V
1563/// A 128-bit vector of [8 x i16] containing the source values.
1564/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1565/// values.
1566static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1568 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1569}
1570
1571/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1572/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1573/// elements of a 256-bit vector of [4 x i64].
1574///
1575/// \code{.operation}
1576/// result[63:0] := ZeroExtend(__V[15:0])
1577/// result[127:64] := ZeroExtend(__V[31:16])
1578/// result[191:128] := ZeroExtend(__V[47:32])
/// result[255:192] := ZeroExtend(__V[63:48])
1580/// \endcode
1581///
1582/// \headerfile <immintrin.h>
1583///
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1585///
1586/// \param __V
1587/// A 128-bit vector of [8 x i16] containing the source values.
1588/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1589/// values.
1590static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1592 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1593}
1594
1595/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1596/// \a __V and returns the 64-bit values in the corresponding elements of a
1597/// 256-bit vector of [4 x i64].
1598///
1599/// \code{.operation}
1600/// result[63:0] := ZeroExtend(__V[31:0])
1601/// result[127:64] := ZeroExtend(__V[63:32])
1602/// result[191:128] := ZeroExtend(__V[95:64])
1603/// result[255:192] := ZeroExtend(__V[127:96])
1604/// \endcode
1605///
1606/// \headerfile <immintrin.h>
1607///
1608/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1609///
1610/// \param __V
1611/// A 128-bit vector of [4 x i32] containing the source values.
1612/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1613/// values.
1614static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1616 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1617}
1618
1619/// Multiplies signed 32-bit integers from even-numbered elements of two
1620/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1621/// [4 x i64] result.
1622///
1623/// \code{.operation}
1624/// result[63:0] := __a[31:0] * __b[31:0]
1625/// result[127:64] := __a[95:64] * __b[95:64]
1626/// result[191:128] := __a[159:128] * __b[159:128]
1627/// result[255:192] := __a[223:192] * __b[223:192]
1628/// \endcode
1629///
1630/// \headerfile <immintrin.h>
1631///
1632/// This intrinsic corresponds to the \c VPMULDQ instruction.
1633///
1634/// \param __a
1635/// A 256-bit vector of [8 x i32] containing one of the source operands.
1636/// \param __b
1637/// A 256-bit vector of [8 x i32] containing one of the source operands.
1638/// \returns A 256-bit vector of [4 x i64] containing the products.
1639static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1640_mm256_mul_epi32(__m256i __a, __m256i __b) {
1641 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1642}
1643
1644/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1645/// [16 x i16], truncates the 32-bit results to the most significant 18
1646/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1647/// product in the [16 x i16] result.
1648///
1649/// \code{.operation}
1650/// FOR i := 0 TO 15
1651/// j := i*16
1652/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
/// result[j+15:j] := temp[16:1]
/// ENDFOR
1654/// \endcode
1655///
1656/// \headerfile <immintrin.h>
1657///
1658/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1659///
1660/// \param __a
1661/// A 256-bit vector of [16 x i16] containing one of the source operands.
1662/// \param __b
1663/// A 256-bit vector of [16 x i16] containing one of the source operands.
1664/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1665static __inline__ __m256i __DEFAULT_FN_ATTRS256
1666_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1667{
1668 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1669}
1670
1671/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1672/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1673/// [16 x i16] result.
1674///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   temp[31:0] := __a[j+15:j] * __b[j+15:j]
///   result[j+15:j] := temp[31:16]
/// ENDFOR
/// \endcode
///
1675/// \headerfile <immintrin.h>
1676///
1677/// This intrinsic corresponds to the \c VPMULHUW instruction.
1678///
1679/// \param __a
1680/// A 256-bit vector of [16 x i16] containing one of the source operands.
1681/// \param __b
1682/// A 256-bit vector of [16 x i16] containing one of the source operands.
1683/// \returns A 256-bit vector of [16 x i16] containing the products.
1684static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1685_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1686{
1687 return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
1688}
1689
1690/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1691/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1692/// [16 x i16] result.
1693///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   temp[31:0] := __a[j+15:j] * __b[j+15:j]
///   result[j+15:j] := temp[31:16]
/// ENDFOR
/// \endcode
///
1694/// \headerfile <immintrin.h>
1695///
1696/// This intrinsic corresponds to the \c VPMULHW instruction.
1697///
1698/// \param __a
1699/// A 256-bit vector of [16 x i16] containing one of the source operands.
1700/// \param __b
1701/// A 256-bit vector of [16 x i16] containing one of the source operands.
1702/// \returns A 256-bit vector of [16 x i16] containing the products.
1703static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1704_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1705{
1706 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1707}
1708
1709/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1710/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1711/// [16 x i16] result.
1712///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   temp[31:0] := __a[j+15:j] * __b[j+15:j]
///   result[j+15:j] := temp[15:0]
/// ENDFOR
/// \endcode
///
1713/// \headerfile <immintrin.h>
1714///
1715/// This intrinsic corresponds to the \c VPMULLW instruction.
1716///
1717/// \param __a
1718/// A 256-bit vector of [16 x i16] containing one of the source operands.
1719/// \param __b
1720/// A 256-bit vector of [16 x i16] containing one of the source operands.
1721/// \returns A 256-bit vector of [16 x i16] containing the products.
1722static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1723_mm256_mullo_epi16(__m256i __a, __m256i __b)
1724{
 /* The lanes are multiplied as unsigned values: the low 16 bits of a product
    are the same for signed and unsigned operands (presumably chosen so that
    wraparound is well-defined). */
1725 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1726}
1727
1728/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1729/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1730/// [8 x i32] result.
1731///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   temp[63:0] := __a[j+31:j] * __b[j+31:j]
///   result[j+31:j] := temp[31:0]
/// ENDFOR
/// \endcode
///
1732/// \headerfile <immintrin.h>
1733///
1734/// This intrinsic corresponds to the \c VPMULLD instruction.
1735///
1736/// \param __a
1737/// A 256-bit vector of [8 x i32] containing one of the source operands.
1738/// \param __b
1739/// A 256-bit vector of [8 x i32] containing one of the source operands.
1740/// \returns A 256-bit vector of [8 x i32] containing the products.
1741static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1742_mm256_mullo_epi32(__m256i __a, __m256i __b) {
 /* The lanes are multiplied as unsigned values: the low 32 bits of a product
    are the same for signed and unsigned operands (presumably chosen so that
    wraparound is well-defined). */
1743 return (__m256i)((__v8su)__a * (__v8su)__b);
1744}
1745
1746/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1747/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1748/// [4 x i64] result.
/// The odd-numbered 32-bit elements of \a __a and \a __b do not contribute
/// to the result.
1749///
1750/// \code{.operation}
1751/// result[63:0] := __a[31:0] * __b[31:0]
1752/// result[127:64] := __a[95:64] * __b[95:64]
1753/// result[191:128] := __a[159:128] * __b[159:128]
1754/// result[255:192] := __a[223:192] * __b[223:192]
1755/// \endcode
1756///
1757/// \headerfile <immintrin.h>
1758///
1759/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1760///
1761/// \param __a
1762/// A 256-bit vector of [8 x i32] containing one of the source operands.
1763/// \param __b
1764/// A 256-bit vector of [8 x i32] containing one of the source operands.
1765/// \returns A 256-bit vector of [4 x i64] containing the products.
1766static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1767_mm256_mul_epu32(__m256i __a, __m256i __b) {
1768 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1769}
1770
1771/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1772/// \a __b.
1773///
1774/// \headerfile <immintrin.h>
1775///
1776/// This intrinsic corresponds to the \c VPOR instruction.
1777///
1778/// \param __a
1779/// A 256-bit integer vector.
1780/// \param __b
1781/// A 256-bit integer vector.
1782/// \returns A 256-bit integer vector containing the result.
1783static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
1784_mm256_or_si256(__m256i __a, __m256i __b)
1785{
 /* OR is bitwise, so the lane width does not affect the result; [4 x u64] is
    just the element type the operator is applied to. */
1786 return (__m256i)((__v4du)__a | (__v4du)__b);
1787}
1788
1789/// Computes four sum of absolute difference (SAD) operations on sets of eight
1790/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1791/// \a __b.
1792///
1793/// One SAD result is computed for each set of eight bytes from \a __a and
1794/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1795/// corresponding 64-bit element of the result.
1796///
1797/// A single SAD operation takes the differences between the corresponding
1798/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1799/// and sums these eight values to form one 16-bit result. This operation
1800/// is repeated four times with successive sets of eight bytes.
1801///
1802/// \code{.operation}
1803/// FOR i := 0 TO 3
1804/// j := i*64
1805/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1806/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1807/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1808/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1809/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1810/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1811/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1812/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1813/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1814/// temp4 + temp5 + temp6 + temp7
1815/// result[j+63:j+16] := 0
1816/// ENDFOR
1817/// \endcode
1818///
1819/// \headerfile <immintrin.h>
1820///
1821/// This intrinsic corresponds to the \c VPSADBW instruction.
1822///
1823/// \param __a
1824/// A 256-bit integer vector containing 32 unsigned 8-bit integers.
1825/// \param __b
1826/// A 256-bit integer vector containing 32 unsigned 8-bit integers.
1827/// \returns A 256-bit integer vector containing the four SAD results, each
///    zero-extended to 64 bits.
1828static __inline__ __m256i __DEFAULT_FN_ATTRS256
1829_mm256_sad_epu8(__m256i __a, __m256i __b)
1830{
1831 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1832}
1833
1834/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1835/// to control information in the 256-bit integer vector \a __b, and
1836/// returns the 256-bit result. In effect there are two separate 128-bit
1837/// shuffles in the lower and upper halves.
1838///
1839/// \code{.operation}
1840/// FOR i := 0 TO 31
1841/// j := i*8
1842/// IF __b[j+7] == 1
1843/// result[j+7:j] := 0
1844/// ELSE
1845/// k := __b[j+3:j] * 8
1846/// IF i > 15
1847/// k := k + 128
1848/// FI
1849/// result[j+7:j] := __a[k+7:k]
1850/// FI
1851/// ENDFOR
1852/// \endcode
1853///
1854/// \headerfile <immintrin.h>
1855///
1856/// This intrinsic corresponds to the \c VPSHUFB instruction.
1857///
1858/// \param __a
1859/// A 256-bit integer vector containing source values.
1860/// \param __b
1861/// A 256-bit integer vector containing control information to determine
1862/// what goes into the corresponding byte of the result. If bit 7 of the
1863/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1864/// control byte specify the index (within the same 128-bit half) of \a __a
1865/// to copy to the result byte.
///    Bits 6:4 of each control byte are not used by the operation above.
1866/// \returns A 256-bit integer vector containing the result.
1867static __inline__ __m256i __DEFAULT_FN_ATTRS256
1868_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1869{
1870 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1871}
1872
1873/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1874/// according to control information in the integer literal \a imm, and
1875/// returns the 256-bit result. In effect there are two parallel 128-bit
1876/// shuffles in the lower and upper halves.
1877///
1878/// \code{.operation}
1879/// FOR i := 0 to 3
1880/// j := i*32
1881/// k := (imm >> i*2)[1:0] * 32
1882/// result[j+31:j] := a[k+31:k]
1883/// result[128+j+31:128+j] := a[128+k+31:128+k]
1884/// ENDFOR
1885/// \endcode
1886///
1887/// \headerfile <immintrin.h>
1888///
1889/// \code
1890/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1891/// \endcode
1892///
1893/// This intrinsic corresponds to the \c VPSHUFD instruction.
1894///
1895/// \param a
1896/// A 256-bit vector of [8 x i32] containing source values.
1897/// \param imm
1898/// An immediate 8-bit value specifying which elements to copy from \a a.
1899/// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1900/// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1901/// forth.
1902/// \returns A 256-bit vector of [8 x i32] containing the result.
1903#define _mm256_shuffle_epi32(a, imm) \
1904 ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1905
1906/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1907/// according to control information in the integer literal \a imm, and
1908/// returns the 256-bit result. The upper 64 bits of each 128-bit half
1909/// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1910/// copied from \a a unchanged.
1911///
1912/// \code{.operation}
1913/// result[63:0] := a[63:0]
1914/// result[191:128] := a[191:128]
1915/// FOR i := 0 TO 3
1916/// j := i * 16 + 64
1917/// k := (imm >> i*2)[1:0] * 16 + 64
1918/// result[j+15:j] := a[k+15:k]
1919/// result[128+j+15:128+j] := a[128+k+15:128+k]
1920/// ENDFOR
1921/// \endcode
1922///
1923/// \headerfile <immintrin.h>
1924///
1925/// \code
1926/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1927/// \endcode
1928///
1929/// This intrinsic corresponds to the \c VPSHUFHW instruction.
1930///
1931/// \param a
1932/// A 256-bit vector of [16 x i16] containing source values.
1933/// \param imm
1934/// An immediate 8-bit value specifying which elements to copy from \a a.
1935/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1936/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1937/// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1938/// \returns A 256-bit vector of [16 x i16] containing the result.
1939#define _mm256_shufflehi_epi16(a, imm) \
1940 ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1941
1942/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1943/// according to control information in the integer literal \a imm, and
1944/// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1945/// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1946/// copied from \a a unchanged.
/// The same selectors from \a imm are applied to both 128-bit halves.
1947///
1948/// \code{.operation}
1949/// result[127:64] := a[127:64]
1950/// result[255:192] := a[255:192]
1951/// FOR i := 0 TO 3
1952/// j := i * 16
1953/// k := (imm >> i*2)[1:0] * 16
1954/// result[j+15:j] := a[k+15:k]
1955/// result[128+j+15:128+j] := a[128+k+15:128+k]
1956/// ENDFOR
1957/// \endcode
1958///
1959/// \headerfile <immintrin.h>
1960///
1961/// \code
1962/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1963/// \endcode
1964///
1965/// This intrinsic corresponds to the \c VPSHUFLW instruction.
1966///
1967/// \param a
1968/// A 256-bit vector of [16 x i16] to use as a source of data for the
1969/// result.
1970/// \param imm
1971/// An immediate 8-bit value specifying which elements to copy from \a a.
1972/// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1973/// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1974/// forth.
1975/// \returns A 256-bit vector of [16 x i16] containing the result.
1976#define _mm256_shufflelo_epi16(a, imm) \
1977 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
1978
1979/// Sets each byte of the result to the corresponding byte of the 256-bit
1980/// integer vector in \a __a, the negative of that byte, or zero, depending
1981/// on whether the corresponding byte of the 256-bit integer vector in
1982/// \a __b is greater than zero, less than zero, or equal to zero,
1983/// respectively.
1984///
1985/// \headerfile <immintrin.h>
1986///
1987/// This intrinsic corresponds to the \c VPSIGNB instruction.
1988///
1989/// \param __a
1990/// A 256-bit integer vector.
1991/// \param __b
1992/// A 256-bit integer vector.
1993/// \returns A 256-bit integer vector containing the result.
1994static __inline__ __m256i __DEFAULT_FN_ATTRS256
1995_mm256_sign_epi8(__m256i __a, __m256i __b)
1996{
1997 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
1998}
1999
2000/// Sets each element of the result to the corresponding element of the
2001/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2002/// or zero, depending on whether the corresponding element of the 256-bit
2003/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2004/// equal to zero, respectively.
2005///
2006/// \headerfile <immintrin.h>
2007///
2008/// This intrinsic corresponds to the \c VPSIGNW instruction.
2009///
2010/// \param __a
2011/// A 256-bit vector of [16 x i16] containing the values to copy or negate.
2012/// \param __b
2013/// A 256-bit vector of [16 x i16] whose element signs select the operation.
2014/// \returns A 256-bit vector of [16 x i16] containing the result.
2015static __inline__ __m256i __DEFAULT_FN_ATTRS256
2016_mm256_sign_epi16(__m256i __a, __m256i __b)
2017{
2018 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2019}
2020
2021/// Sets each element of the result to the corresponding element of the
2022/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2023/// zero, depending on whether the corresponding element of the 256-bit
2024/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2025/// equal to zero, respectively.
2026///
2027/// \headerfile <immintrin.h>
2028///
2029/// This intrinsic corresponds to the \c VPSIGND instruction.
2030///
2031/// \param __a
2032/// A 256-bit vector of [8 x i32] containing the values to copy or negate.
2033/// \param __b
2034/// A 256-bit vector of [8 x i32] whose element signs select the operation.
2035/// \returns A 256-bit vector of [8 x i32] containing the result.
2036static __inline__ __m256i __DEFAULT_FN_ATTRS256
2037_mm256_sign_epi32(__m256i __a, __m256i __b)
2038{
2039 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2040}
2041
2042/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2043/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2044/// is greater than 15, the returned result is all zeroes.
/// This macro has the same expansion as \c _mm256_bslli_epi128.
2045///
2046/// \headerfile <immintrin.h>
2047///
2048/// \code
2049/// __m256i _mm256_slli_si256(__m256i a, const int imm);
2050/// \endcode
2051///
2052/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2053///
2054/// \param a
2055/// A 256-bit integer vector to be shifted.
2056/// \param imm
2057/// An unsigned immediate value specifying the shift count (in bytes).
2058/// \returns A 256-bit integer vector containing the result.
2059#define _mm256_slli_si256(a, imm) \
2060 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2061 (int)(imm)))
2062
2063/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2064/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2065/// is greater than 15, the returned result is all zeroes.
/// This macro has the same expansion as \c _mm256_slli_si256.
2066///
2067/// \headerfile <immintrin.h>
2068///
2069/// \code
2070/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2071/// \endcode
2072///
2073/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2074///
2075/// \param a
2076/// A 256-bit integer vector to be shifted.
2077/// \param imm
2078/// An unsigned immediate value specifying the shift count (in bytes).
2079/// \returns A 256-bit integer vector containing the result.
2080#define _mm256_bslli_epi128(a, imm) \
2081 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2082 (int)(imm)))
2083
2084/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2085/// left by \a __count bits, shifting in zero bits, and returns the result.
2086/// If \a __count is greater than 15, the returned result is all zeroes.
2087///
2088/// \headerfile <immintrin.h>
2089///
2090/// This intrinsic corresponds to the \c VPSLLW instruction.
2091///
2092/// \param __a
2093/// A 256-bit vector of [16 x i16] to be shifted.
2094/// \param __count
2095/// An unsigned integer value specifying the shift count (in bits).
///    The same count is used for every element.
2096/// \returns A 256-bit vector of [16 x i16] containing the result.
2097static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2098_mm256_slli_epi16(__m256i __a, int __count) {
2099 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2100}
2101
2102/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2103/// left by the number of bits specified by the lower 64 bits of \a __count,
2104/// shifting in zero bits, and returns the result. If that count is greater
2105/// than 15, the returned result is all zeroes.
2106///
2107/// \headerfile <immintrin.h>
2108///
2109/// This intrinsic corresponds to the \c VPSLLW instruction.
2110///
2111/// \param __a
2112/// A 256-bit vector of [16 x i16] to be shifted.
2113/// \param __count
2114/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2115/// shift count (in bits). The upper element is ignored.
2116/// \returns A 256-bit vector of [16 x i16] containing the result.
2117static __inline__ __m256i __DEFAULT_FN_ATTRS256
2118_mm256_sll_epi16(__m256i __a, __m128i __count)
2119{
2120 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2121}
2122
2123/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2124/// left by \a __count bits, shifting in zero bits, and returns the result.
2125/// If \a __count is greater than 31, the returned result is all zeroes.
2126///
2127/// \headerfile <immintrin.h>
2128///
2129/// This intrinsic corresponds to the \c VPSLLD instruction.
2130///
2131/// \param __a
2132/// A 256-bit vector of [8 x i32] to be shifted.
2133/// \param __count
2134/// An unsigned integer value specifying the shift count (in bits).
///    The same count is used for every element.
2135/// \returns A 256-bit vector of [8 x i32] containing the result.
2136static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2137_mm256_slli_epi32(__m256i __a, int __count) {
2138 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2139}
2140
2141/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2142/// left by the number of bits given in the lower 64 bits of \a __count,
2143/// shifting in zero bits, and returns the result. If that count is greater
2144/// than 31, the returned result is all zeroes.
2145///
2146/// \headerfile <immintrin.h>
2147///
2148/// This intrinsic corresponds to the \c VPSLLD instruction.
2149///
2150/// \param __a
2151/// A 256-bit vector of [8 x i32] to be shifted.
2152/// \param __count
2153/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2154/// shift count (in bits). The upper element is ignored.
2155/// \returns A 256-bit vector of [8 x i32] containing the result.
2156static __inline__ __m256i __DEFAULT_FN_ATTRS256
2157_mm256_sll_epi32(__m256i __a, __m128i __count)
2158{
2159 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2160}
2161
2162/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2163/// left by \a __count bits, shifting in zero bits, and returns the result.
2164/// If \a __count is greater than 63, the returned result is all zeroes.
2165///
2166/// \headerfile <immintrin.h>
2167///
2168/// This intrinsic corresponds to the \c VPSLLQ instruction.
2169///
2170/// \param __a
2171/// A 256-bit vector of [4 x i64] to be shifted.
2172/// \param __count
2173/// An unsigned integer value specifying the shift count (in bits).
///    The same count is used for every element.
2174/// \returns A 256-bit vector of [4 x i64] containing the result.
2175static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2176_mm256_slli_epi64(__m256i __a, int __count) {
2177 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2178}
2179
2180/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2181/// left by the number of bits given in the lower 64 bits of \a __count,
2182/// shifting in zero bits, and returns the result. If that count is greater
2183/// than 63, the returned result is all zeroes.
2184///
2185/// \headerfile <immintrin.h>
2186///
2187/// This intrinsic corresponds to the \c VPSLLQ instruction.
2188///
2189/// \param __a
2190/// A 256-bit vector of [4 x i64] to be shifted.
2191/// \param __count
2192/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2193/// shift count (in bits). The upper element is ignored.
2194/// \returns A 256-bit vector of [4 x i64] containing the result.
2195static __inline__ __m256i __DEFAULT_FN_ATTRS256
2196_mm256_sll_epi64(__m256i __a, __m128i __count)
2197{
2198 return __builtin_ia32_psllq256((__v4di)__a, __count);
2199}
2200
2201/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2202/// right by \a __count bits, shifting in sign bits, and returns the result.
2203/// If \a __count is greater than 15, each element of the result is either
2204/// 0 or -1 according to the corresponding input sign bit.
2205///
2206/// \headerfile <immintrin.h>
2207///
2208/// This intrinsic corresponds to the \c VPSRAW instruction.
2209///
2210/// \param __a
2211/// A 256-bit vector of [16 x i16] to be shifted.
2212/// \param __count
2213/// An unsigned integer value specifying the shift count (in bits).
///    The same count is used for every element.
2214/// \returns A 256-bit vector of [16 x i16] containing the result.
2215static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2216_mm256_srai_epi16(__m256i __a, int __count) {
2217 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2218}
2219
2220/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2221/// right by the number of bits given in the lower 64 bits of \a __count,
2222/// shifting in sign bits, and returns the result. If that count is greater
2223/// than 15, each element of the result is either 0 or -1 according to the
2224/// corresponding input sign bit.
2225///
2226/// \headerfile <immintrin.h>
2227///
2228/// This intrinsic corresponds to the \c VPSRAW instruction.
2229///
2230/// \param __a
2231/// A 256-bit vector of [16 x i16] to be shifted.
2232/// \param __count
2233/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2234/// shift count (in bits). The upper element is ignored.
2235/// \returns A 256-bit vector of [16 x i16] containing the result.
2236static __inline__ __m256i __DEFAULT_FN_ATTRS256
2237_mm256_sra_epi16(__m256i __a, __m128i __count)
2238{
2239 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2240}
2241
2242/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2243/// right by \a __count bits, shifting in sign bits, and returns the result.
2244/// If \a __count is greater than 31, each element of the result is either
2245/// 0 or -1 according to the corresponding input sign bit.
2246///
2247/// \headerfile <immintrin.h>
2248///
2249/// This intrinsic corresponds to the \c VPSRAD instruction.
2250///
2251/// \param __a
2252/// A 256-bit vector of [8 x i32] to be shifted.
2253/// \param __count
2254/// An unsigned integer value specifying the shift count (in bits).
///    The same count is used for every element.
2255/// \returns A 256-bit vector of [8 x i32] containing the result.
2256static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2257_mm256_srai_epi32(__m256i __a, int __count) {
2258 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2259}
2260
2261/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2262/// right by the number of bits given in the lower 64 bits of \a __count,
2263/// shifting in sign bits, and returns the result. If that count is greater
2264/// than 31, each element of the result is either 0 or -1 according to the
2265/// corresponding input sign bit.
2266///
2267/// \headerfile <immintrin.h>
2268///
2269/// This intrinsic corresponds to the \c VPSRAD instruction.
2270///
2271/// \param __a
2272/// A 256-bit vector of [8 x i32] to be shifted.
2273/// \param __count
2274/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2275/// shift count (in bits). The upper element is ignored.
2276/// \returns A 256-bit vector of [8 x i32] containing the result.
2277static __inline__ __m256i __DEFAULT_FN_ATTRS256
2278_mm256_sra_epi32(__m256i __a, __m128i __count)
2279{
2280 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2281}
2282
2283/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2284/// \a imm bytes, shifting in zero bytes, and returns the result. If
2285/// \a imm is greater than 15, the returned result is all zeroes.
/// This macro has the same expansion as \c _mm256_bsrli_epi128.
2286///
2287/// \headerfile <immintrin.h>
2288///
2289/// \code
2290/// __m256i _mm256_srli_si256(__m256i a, const int imm);
2291/// \endcode
2292///
2293/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2294///
2295/// \param a
2296/// A 256-bit integer vector to be shifted.
2297/// \param imm
2298/// An unsigned immediate value specifying the shift count (in bytes).
2299/// \returns A 256-bit integer vector containing the result.
2300#define _mm256_srli_si256(a, imm) \
2301 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2302 (int)(imm)))
2303
2304/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2305/// \a imm bytes, shifting in zero bytes, and returns the result. If
2306/// \a imm is greater than 15, the returned result is all zeroes.
/// This macro has the same expansion as \c _mm256_srli_si256.
2307///
2308/// \headerfile <immintrin.h>
2309///
2310/// \code
2311/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2312/// \endcode
2313///
2314/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2315///
2316/// \param a
2317/// A 256-bit integer vector to be shifted.
2318/// \param imm
2319/// An unsigned immediate value specifying the shift count (in bytes).
2320/// \returns A 256-bit integer vector containing the result.
2321#define _mm256_bsrli_epi128(a, imm) \
2322 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2323 (int)(imm)))
2324
2325/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2326/// right by \a __count bits, shifting in zero bits, and returns the result.
2327/// If \a __count is greater than 15, the returned result is all zeroes.
2328///
2329/// \headerfile <immintrin.h>
2330///
2331/// This intrinsic corresponds to the \c VPSRLW instruction.
2332///
2333/// \param __a
2334/// A 256-bit vector of [16 x i16] to be shifted.
2335/// \param __count
2336/// An unsigned integer value specifying the shift count (in bits).
///    The same count is used for every element.
2337/// \returns A 256-bit vector of [16 x i16] containing the result.
2338static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2339_mm256_srli_epi16(__m256i __a, int __count) {
2340 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2341}
2342
2343/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2344/// right by the number of bits given in the lower 64 bits of \a __count,
2345/// shifting in zero bits, and returns the result. If that count is greater
2346/// than 15, the returned result is all zeroes.
2347///
2348/// \headerfile <immintrin.h>
2349///
2350/// This intrinsic corresponds to the \c VPSRLW instruction.
2351///
2352/// \param __a
2353/// A 256-bit vector of [16 x i16] to be shifted.
2354/// \param __count
2355/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2356/// shift count (in bits). The upper element is ignored.
2357/// \returns A 256-bit vector of [16 x i16] containing the result.
2358static __inline__ __m256i __DEFAULT_FN_ATTRS256
2359_mm256_srl_epi16(__m256i __a, __m128i __count)
2360{
2361 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2362}
2363
2364/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2365/// right by \a __count bits, shifting in zero bits, and returns the result.
2366/// If \a __count is greater than 31, the returned result is all zeroes.
2367///
2368/// \headerfile <immintrin.h>
2369///
2370/// This intrinsic corresponds to the \c VPSRLD instruction.
2371///
2372/// \param __a
2373/// A 256-bit vector of [8 x i32] to be shifted.
2374/// \param __count
2375/// An unsigned integer value specifying the shift count (in bits).
///    The same count is used for every element.
2376/// \returns A 256-bit vector of [8 x i32] containing the result.
2377static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2378_mm256_srli_epi32(__m256i __a, int __count) {
2379 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2380}
2381
2382/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2383/// right by the number of bits given in the lower 64 bits of \a __count,
2384/// shifting in zero bits, and returns the result. If that count is greater
2385/// than 31, the returned result is all zeroes.
2386///
2387/// \headerfile <immintrin.h>
2388///
2389/// This intrinsic corresponds to the \c VPSRLD instruction.
2390///
2391/// \param __a
2392/// A 256-bit vector of [8 x i32] to be shifted.
2393/// \param __count
2394/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2395/// shift count (in bits). The upper element is ignored.
2396/// \returns A 256-bit vector of [8 x i32] containing the result.
2397static __inline__ __m256i __DEFAULT_FN_ATTRS256
2398_mm256_srl_epi32(__m256i __a, __m128i __count)
2399{
2400 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2401}
2402
2403/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2404/// right by \a __count bits, shifting in zero bits, and returns the result.
2405/// If \a __count is greater than 63, the returned result is all zeroes.
2406///
2407/// \headerfile <immintrin.h>
2408///
2409/// This intrinsic corresponds to the \c VPSRLQ instruction.
2410///
2411/// \param __a
2412/// A 256-bit vector of [4 x i64] to be shifted.
2413/// \param __count
2414/// An unsigned integer value specifying the shift count (in bits).
///    The same count is used for every element.
2415/// \returns A 256-bit vector of [4 x i64] containing the result.
2416static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2417_mm256_srli_epi64(__m256i __a, int __count) {
2418 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2419}
2420
2421/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2422/// right by the number of bits given in the lower 64 bits of \a __count,
2423/// shifting in zero bits, and returns the result. If that count is greater
2424/// than 63, the returned result is all zeroes.
2425///
2426/// \headerfile <immintrin.h>
2427///
2428/// This intrinsic corresponds to the \c VPSRLQ instruction.
2429///
2430/// \param __a
2431/// A 256-bit vector of [4 x i64] to be shifted.
2432/// \param __count
2433/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2434/// shift count (in bits). The upper element is ignored.
2435/// \returns A 256-bit vector of [4 x i64] containing the result.
2436static __inline__ __m256i __DEFAULT_FN_ATTRS256
2437_mm256_srl_epi64(__m256i __a, __m128i __count)
2438{
2439 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2440}
2441
2442/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2443/// vectors. Returns the lower 8 bits of each difference in the
2444/// corresponding byte of the 256-bit integer vector result (overflow is
2445/// ignored).
2446///
2447/// \code{.operation}
2448/// FOR i := 0 TO 31
2449/// j := i*8
2450/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2451/// ENDFOR
2452/// \endcode
2453///
2454/// \headerfile <immintrin.h>
2455///
2456/// This intrinsic corresponds to the \c VPSUBB instruction.
2457///
2458/// \param __a
2459/// A 256-bit integer vector containing the minuends.
2460/// \param __b
2461/// A 256-bit integer vector containing the subtrahends.
2462/// \returns A 256-bit integer vector containing the differences.
2463static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2464_mm256_sub_epi8(__m256i __a, __m256i __b) {
2465 return (__m256i)((__v32qu)__a - (__v32qu)__b); /* element-wise subtract on the __v32qu view (presumably unsigned bytes, so wraparound is well defined -- confirm against the typedef) */
2466}
2467
2468/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2469/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2470/// the corresponding element of the [16 x i16] result (overflow is
2471/// ignored).
2472///
2473/// \code{.operation}
2474/// FOR i := 0 TO 15
2475/// j := i*16
2476/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2477/// ENDFOR
2478/// \endcode
2479///
2480/// \headerfile <immintrin.h>
2481///
2482/// This intrinsic corresponds to the \c VPSUBW instruction.
2483///
2484/// \param __a
2485/// A 256-bit vector of [16 x i16] containing the minuends.
2486/// \param __b
2487/// A 256-bit vector of [16 x i16] containing the subtrahends.
2488/// \returns A 256-bit vector of [16 x i16] containing the differences.
2489static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2490_mm256_sub_epi16(__m256i __a, __m256i __b) {
2491 return (__m256i)((__v16hu)__a - (__v16hu)__b); /* element-wise subtract on the __v16hu view (presumably unsigned 16-bit lanes, so wraparound is well defined -- confirm against the typedef) */
2492}
2493
2494/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2495/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2496/// the corresponding element of the [8 x i32] result (overflow is ignored).
2497///
2498/// \code{.operation}
2499/// FOR i := 0 TO 7
2500/// j := i*32
2501/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2502/// ENDFOR
2503/// \endcode
2504///
2505/// \headerfile <immintrin.h>
2506///
2507/// This intrinsic corresponds to the \c VPSUBD instruction.
2508///
2509/// \param __a
2510/// A 256-bit vector of [8 x i32] containing the minuends.
2511/// \param __b
2512/// A 256-bit vector of [8 x i32] containing the subtrahends.
2513/// \returns A 256-bit vector of [8 x i32] containing the differences.
2514static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2515_mm256_sub_epi32(__m256i __a, __m256i __b) {
2516 return (__m256i)((__v8su)__a - (__v8su)__b); /* element-wise subtract on the __v8su view (presumably unsigned 32-bit lanes, so wraparound is well defined -- confirm against the typedef) */
2517}
2518
2519/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2520/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2521/// the corresponding element of the [4 x i64] result (overflow is ignored).
2522///
2523/// \code{.operation}
2524/// FOR i := 0 TO 3
2525/// j := i*64
2526/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2527/// ENDFOR
2528/// \endcode
2529///
2530/// \headerfile <immintrin.h>
2531///
2532/// This intrinsic corresponds to the \c VPSUBQ instruction.
2533///
2534/// \param __a
2535/// A 256-bit vector of [4 x i64] containing the minuends.
2536/// \param __b
2537/// A 256-bit vector of [4 x i64] containing the subtrahends.
2538/// \returns A 256-bit vector of [4 x i64] containing the differences.
2539static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2540_mm256_sub_epi64(__m256i __a, __m256i __b) {
2541 return (__m256i)((__v4du)__a - (__v4du)__b); /* element-wise subtract on the __v4du view (presumably unsigned 64-bit lanes, so wraparound is well defined -- confirm against the typedef) */
2542}
2543
2544/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2545/// vectors using signed saturation, and returns each difference in the
2546/// corresponding byte of the 256-bit integer vector result.
2547///
2548/// \code{.operation}
2549/// FOR i := 0 TO 31
2550/// j := i*8
2551/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2552/// ENDFOR
2553/// \endcode
2554///
2555/// \headerfile <immintrin.h>
2556///
2557/// This intrinsic corresponds to the \c VPSUBSB instruction.
2558///
2559/// \param __a
2560/// A 256-bit integer vector containing the minuends.
2561/// \param __b
2562/// A 256-bit integer vector containing the subtrahends.
2563/// \returns A 256-bit integer vector containing the differences.
2564static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2565_mm256_subs_epi8(__m256i __a, __m256i __b) {
2566 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); /* saturating subtract on the __v32qs byte view; the doc comment specifies signed saturation */
2567}
2568
2569/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2570/// vectors of [16 x i16] using signed saturation, and returns each
2571/// difference in the corresponding element of the [16 x i16] result.
2572///
2573/// \code{.operation}
2574/// FOR i := 0 TO 15
2575/// j := i*16
2576/// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2577/// ENDFOR
2578/// \endcode
2579///
2580/// \headerfile <immintrin.h>
2581///
2582/// This intrinsic corresponds to the \c VPSUBSW instruction.
2583///
2584/// \param __a
2585/// A 256-bit vector of [16 x i16] containing the minuends.
2586/// \param __b
2587/// A 256-bit vector of [16 x i16] containing the subtrahends.
2588/// \returns A 256-bit vector of [16 x i16] containing the differences.
2589static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2590_mm256_subs_epi16(__m256i __a, __m256i __b) {
2591 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); /* saturating subtract on the __v16hi 16-bit view; the doc comment specifies signed saturation */
2592}
2593
2594/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2595/// vectors using unsigned saturation, and returns each difference in the
2596/// corresponding byte of the 256-bit integer vector result. For each byte,
2597/// computes <c> result = __a - __b </c>.
2598///
2599/// \code{.operation}
2600/// FOR i := 0 TO 31
2601/// j := i*8
2602/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2603/// ENDFOR
2604/// \endcode
2605///
2606/// \headerfile <immintrin.h>
2607///
2608/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2609///
2610/// \param __a
2611/// A 256-bit integer vector containing the minuends.
2612/// \param __b
2613/// A 256-bit integer vector containing the subtrahends.
2614/// \returns A 256-bit integer vector containing the differences.
2615static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2616_mm256_subs_epu8(__m256i __a, __m256i __b) {
2617 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); /* saturating subtract on the __v32qu byte view; the doc comment specifies unsigned saturation */
2618}
2619
2620/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2621/// vectors of [16 x i16] using unsigned saturation, and returns each
2622/// difference in the corresponding element of the [16 x i16] result.
2623///
2624/// \code{.operation}
2625/// FOR i := 0 TO 15
2626/// j := i*16
2627/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2628/// ENDFOR
2629/// \endcode
2630///
2631/// \headerfile <immintrin.h>
2632///
2633/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2634///
2635/// \param __a
2636/// A 256-bit vector of [16 x i16] containing the minuends.
2637/// \param __b
2638/// A 256-bit vector of [16 x i16] containing the subtrahends.
2639/// \returns A 256-bit vector of [16 x i16] containing the differences.
2640static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2641_mm256_subs_epu16(__m256i __a, __m256i __b) {
2642 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); /* saturating subtract on the __v16hu 16-bit view; the doc comment specifies unsigned saturation */
2643}
2644
2645/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2646/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2647/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2648/// input; other bits in these parameters are ignored.
2649///
2650/// \code{.operation}
2651/// result[7:0] := __a[71:64]
2652/// result[15:8] := __b[71:64]
2653/// result[23:16] := __a[79:72]
2654/// result[31:24] := __b[79:72]
2655/// . . .
2656/// result[127:120] := __b[127:120]
2657/// result[135:128] := __a[199:192]
2658/// . . .
2659/// result[255:248] := __b[255:248]
2660/// \endcode
2661///
2662/// \headerfile <immintrin.h>
2663///
2664/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2665///
2666/// \param __a
2667/// A 256-bit integer vector used as the source for the even-numbered bytes
2668/// of the result.
2669/// \param __b
2670/// A 256-bit integer vector used as the source for the odd-numbered bytes
2671/// of the result.
2672/// \returns A 256-bit integer vector containing the result.
2673static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2674_mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
2675 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); /* indexes 8-15 and 24-31 are the upper 8 bytes of each 128-bit half; an index written 32+k selects byte k of __b */
2676}
2677
2678/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2679/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2680/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2681/// 128-bit half of \a __a and \a __b as input; other bits in these
2682/// parameters are ignored.
2683///
2684/// \code{.operation}
2685/// result[15:0] := __a[79:64]
2686/// result[31:16] := __b[79:64]
2687/// result[47:32] := __a[95:80]
2688/// result[63:48] := __b[95:80]
2689/// . . .
2690/// result[127:112] := __b[127:112]
2691/// result[143:128] := __a[207:192]
2692/// . . .
2693/// result[255:240] := __b[255:240]
2694/// \endcode
2695///
2696/// \headerfile <immintrin.h>
2697///
2698/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2699///
2700/// \param __a
2701/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2702/// elements of the result.
2703/// \param __b
2704/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2705/// elements of the result.
2706/// \returns A 256-bit vector of [16 x i16] containing the result.
2707static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2708_mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
2709 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); /* indexes 4-7 and 12-15 are the upper four elements of each 128-bit half; an index written 16+k selects element k of __b */
2710}
2711
2712/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2713/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2714/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2715/// of \a __a and \a __b as input; other bits in these parameters are
2716/// ignored.
2717///
2718/// \code{.operation}
2719/// result[31:0] := __a[95:64]
2720/// result[63:32] := __b[95:64]
2721/// result[95:64] := __a[127:96]
2722/// result[127:96] := __b[127:96]
2723/// result[159:128] := __a[223:192]
2724/// result[191:160] := __b[223:192]
2725/// result[223:192] := __a[255:224]
2726/// result[255:224] := __b[255:224]
2727/// \endcode
2728///
2729/// \headerfile <immintrin.h>
2730///
2731/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2732///
2733/// \param __a
2734/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2735/// elements of the result.
2736/// \param __b
2737/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2738/// elements of the result.
2739/// \returns A 256-bit vector of [8 x i32] containing the result.
2740static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2741_mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
2742 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); /* indexes 2-3 and 6-7 are the upper two elements of each 128-bit half; an index written 8+k selects element k of __b */
2743}
2744
2745/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2746/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2747/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2748/// of \a __a and \a __b as input; other bits in these parameters are
2749/// ignored.
2750///
2751/// \code{.operation}
2752/// result[63:0] := __a[127:64]
2753/// result[127:64] := __b[127:64]
2754/// result[191:128] := __a[255:192]
2755/// result[255:192] := __b[255:192]
2756/// \endcode
2757///
2758/// \headerfile <immintrin.h>
2759///
2760/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2761///
2762/// \param __a
2763/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2764/// elements of the result.
2765/// \param __b
2766/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2767/// elements of the result.
2768/// \returns A 256-bit vector of [4 x i64] containing the result.
2769static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2770_mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
2771 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); /* indexes 1 and 3 are the upper element of each 128-bit half; an index written 4+k selects element k of __b */
2772}
2773
2774/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2775/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2776/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2777/// input; other bits in these parameters are ignored.
2778///
2779/// \code{.operation}
2780/// result[7:0] := __a[7:0]
2781/// result[15:8] := __b[7:0]
2782/// result[23:16] := __a[15:8]
2783/// result[31:24] := __b[15:8]
2784/// . . .
2785/// result[127:120] := __b[63:56]
2786/// result[135:128] := __a[135:128]
2787/// . . .
2788/// result[255:248] := __b[191:184]
2789/// \endcode
2790///
2791/// \headerfile <immintrin.h>
2792///
2793/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2794///
2795/// \param __a
2796/// A 256-bit integer vector used as the source for the even-numbered bytes
2797/// of the result.
2798/// \param __b
2799/// A 256-bit integer vector used as the source for the odd-numbered bytes
2800/// of the result.
2801/// \returns A 256-bit integer vector containing the result.
2802static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2803_mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
2804 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); /* indexes 0-7 and 16-23 are the lower 8 bytes of each 128-bit half; an index written 32+k selects byte k of __b */
2805}
2806
2807/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2808/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2809/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2810/// 128-bit half of \a __a and \a __b as input; other bits in these
2811/// parameters are ignored.
2812///
2813/// \code{.operation}
2814/// result[15:0] := __a[15:0]
2815/// result[31:16] := __b[15:0]
2816/// result[47:32] := __a[31:16]
2817/// result[63:48] := __b[31:16]
2818/// . . .
2819/// result[127:112] := __b[63:48]
2820/// result[143:128] := __a[143:128]
2821/// . . .
2822/// result[255:240] := __b[191:176]
2823/// \endcode
2824///
2825/// \headerfile <immintrin.h>
2826///
2827/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2828///
2829/// \param __a
2830/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2831/// elements of the result.
2832/// \param __b
2833/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2834/// elements of the result.
2835/// \returns A 256-bit vector of [16 x i16] containing the result.
2836static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2837_mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
2838 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); /* indexes 0-3 and 8-11 are the lower four elements of each 128-bit half; an index written 16+k selects element k of __b */
2839}
2840
2841/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2842/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2843/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2844/// of \a __a and \a __b as input; other bits in these parameters are
2845/// ignored.
2846///
2847/// \code{.operation}
2848/// result[31:0] := __a[31:0]
2849/// result[63:32] := __b[31:0]
2850/// result[95:64] := __a[63:32]
2851/// result[127:96] := __b[63:32]
2852/// result[159:128] := __a[159:128]
2853/// result[191:160] := __b[159:128]
2854/// result[223:192] := __a[191:160]
2855/// result[255:224] := __b[191:160]
2856/// \endcode
2857///
2858/// \headerfile <immintrin.h>
2859///
2860/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2861///
2862/// \param __a
2863/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2864/// elements of the result.
2865/// \param __b
2866/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2867/// elements of the result.
2868/// \returns A 256-bit vector of [8 x i32] containing the result.
2869static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2870_mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
2871 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); /* indexes 0-1 and 4-5 are the lower two elements of each 128-bit half; an index written 8+k selects element k of __b */
2872}
2873
2874/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2875/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2876/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2877/// of \a __a and \a __b as input; other bits in these parameters are
2878/// ignored.
2879///
2880/// \code{.operation}
2881/// result[63:0] := __a[63:0]
2882/// result[127:64] := __b[63:0]
2883/// result[191:128] := __a[191:128]
2884/// result[255:192] := __b[191:128]
2885/// \endcode
2886///
2887/// \headerfile <immintrin.h>
2888///
2889/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2890///
2891/// \param __a
2892/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2893/// elements of the result.
2894/// \param __b
2895/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2896/// elements of the result.
2897/// \returns A 256-bit vector of [4 x i64] containing the result.
2898static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2899_mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
2900 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); /* indexes 0 and 2 are the lower element of each 128-bit half; an index written 4+k selects element k of __b */
2901}
2902
2903/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2904/// \a __b.
2905///
2906/// \headerfile <immintrin.h>
2907///
2908/// This intrinsic corresponds to the \c VPXOR instruction.
2909///
2910/// \param __a
2911/// A 256-bit integer vector.
2912/// \param __b
2913/// A 256-bit integer vector.
2914/// \returns A 256-bit integer vector containing the result.
2915static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
2916_mm256_xor_si256(__m256i __a, __m256i __b)
2917{
2918 return (__m256i)((__v4du)__a ^ (__v4du)__b); /* bitwise XOR, evaluated on the __v4du view of both operands */
2919}
2920
2921/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2922/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2923/// boundary.
2924///
2925/// \headerfile <immintrin.h>
2926///
2927/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2928///
2929/// \param __V
2930/// A pointer to the 32-byte aligned memory containing the vector to load.
2931/// \returns A 256-bit integer vector loaded from memory.
2932static __inline__ __m256i __DEFAULT_FN_ATTRS256
2934{
2935 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2936 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2937}
2938
2939/// Broadcasts the 32-bit floating-point value from the low element of the
2940/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2941/// 128-bit vector of [4 x float].
2942///
2943/// \headerfile <immintrin.h>
2944///
2945/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2946///
2947/// \param __X
2948/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2949/// \returns A 128-bit vector of [4 x float] containing the result.
2950static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
2952 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
2953}
2954
2955/// Broadcasts the 64-bit floating-point value from the low element of the
2956/// 128-bit vector of [2 x double] in \a __a to both elements of the
2957/// result's 128-bit vector of [2 x double].
2958///
2959/// \headerfile <immintrin.h>
2960///
2961/// This intrinsic corresponds to the \c MOVDDUP instruction.
2962///
2963/// \param __a
2964/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2965/// \returns A 128-bit vector of [2 x double] containing the result.
2966static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
2968 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2969}
2970
2971/// Broadcasts the 32-bit floating-point value from the low element of the
2972/// 128-bit vector of [4 x float] in \a __X to all elements of the
2973/// result's 256-bit vector of [8 x float].
2974///
2975/// \headerfile <immintrin.h>
2976///
2977/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2978///
2979/// \param __X
2980/// A 128-bit vector of [4 x float] whose low element will be broadcast.
2981/// \returns A 256-bit vector of [8 x float] containing the result.
2982static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
2984 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2985}
2986
2987/// Broadcasts the 64-bit floating-point value from the low element of the
2988/// 128-bit vector of [2 x double] in \a __X to all elements of the
2989/// result's 256-bit vector of [4 x double].
2990///
2991/// \headerfile <immintrin.h>
2992///
2993/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2994///
2995/// \param __X
2996/// A 128-bit vector of [2 x double] whose low element will be broadcast.
2997/// \returns A 256-bit vector of [4 x double] containing the result.
2998static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
3000 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3001}
3002
3003/// Broadcasts the 128-bit integer data from \a __X to both the lower and
3004/// upper halves of the 256-bit result.
3005///
3006/// \headerfile <immintrin.h>
3007///
3008/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3009///
3010/// \param __X
3011/// A 128-bit integer vector to be broadcast.
3012/// \returns A 256-bit integer vector containing the result.
3013static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3015 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3016}
3017
3018#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) /* alias: forwards its argument unchanged to the _mm256_-prefixed name */
3019
3020/// Merges 32-bit integer elements from either of the two 128-bit vectors of
3021/// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3022/// as specified by the immediate integer operand \a M.
3023///
3024/// \code{.operation}
3025/// FOR i := 0 TO 3
3026/// j := i*32
3027/// IF M[i] == 0
3028/// result[31+j:j] := V1[31+j:j]
3029/// ELSE
3030/// result[31+j:j] := V2[31+j:j]
3031/// FI
3032/// ENDFOR
3033/// \endcode
3034///
3035/// \headerfile <immintrin.h>
3036///
3037/// \code
3038/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3039/// \endcode
3040///
3041/// This intrinsic corresponds to the \c VPBLENDD instruction.
3042///
3043/// \param V1
3044/// A 128-bit vector of [4 x i32] containing source values.
3045/// \param V2
3046/// A 128-bit vector of [4 x i32] containing source values.
3047/// \param M
3048/// An immediate 8-bit integer operand, with bits [3:0] specifying the
3049/// source for each element of the result. The position of the mask bit
3050/// corresponds to the index of a copied value. When a mask bit is 0, the
3051/// element is copied from \a V1; otherwise, it is copied from \a V2.
3052/// \returns A 128-bit vector of [4 x i32] containing the result.
3053#define _mm_blend_epi32(V1, V2, M) \
3054 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
3055 (__v4si)(__m128i)(V2), (int)(M))) /* each vector argument is cast to __m128i, then to the [4 x i32] view; M is cast to int and is documented as an immediate */
3056
3057/// Merges 32-bit integer elements from either of the two 256-bit vectors of
3058/// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3059/// as specified by the immediate integer operand \a M.
3060///
3061/// \code{.operation}
3062/// FOR i := 0 TO 7
3063/// j := i*32
3064/// IF M[i] == 0
3065/// result[31+j:j] := V1[31+j:j]
3066/// ELSE
3067/// result[31+j:j] := V2[31+j:j]
3068/// FI
3069/// ENDFOR
3070/// \endcode
3071///
3072/// \headerfile <immintrin.h>
3073///
3074/// \code
3075/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3076/// \endcode
3077///
3078/// This intrinsic corresponds to the \c VPBLENDD instruction.
3079///
3080/// \param V1
3081/// A 256-bit vector of [8 x i32] containing source values.
3082/// \param V2
3083/// A 256-bit vector of [8 x i32] containing source values.
3084/// \param M
3085/// An immediate 8-bit integer operand, with bits [7:0] specifying the
3086/// source for each element of the result. The position of the mask bit
3087/// corresponds to the index of a copied value. When a mask bit is 0, the
3088/// element is copied from \a V1; otherwise, it is copied from \a V2.
3089/// \returns A 256-bit vector of [8 x i32] containing the result.
3090#define _mm256_blend_epi32(V1, V2, M) \
3091 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
3092 (__v8si)(__m256i)(V2), (int)(M))) /* each vector argument is cast to __m256i, then to the [8 x i32] view; M is cast to int and is documented as an immediate */
3093
3094/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3095/// bytes of the 256-bit result.
3096///
3097/// \headerfile <immintrin.h>
3098///
3099/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3100///
3101/// \param __X
3102/// A 128-bit integer vector whose low byte will be broadcast.
3103/// \returns A 256-bit integer vector containing the result.
3104static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3106 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3107}
3108
3109/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3110/// to all elements of the result's 256-bit vector of [16 x i16].
3111///
3112/// \headerfile <immintrin.h>
3113///
3114/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3115///
3116/// \param __X
3117/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3118/// \returns A 256-bit vector of [16 x i16] containing the result.
3119static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3121 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3122}
3123
3124/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3125/// to all elements of the result's 256-bit vector of [8 x i32].
3126///
3127/// \headerfile <immintrin.h>
3128///
3129/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3130///
3131/// \param __X
3132/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3133/// \returns A 256-bit vector of [8 x i32] containing the result.
3134static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3136 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3137}
3138
3139/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3140/// to all elements of the result's 256-bit vector of [4 x i64].
3141///
3142/// \headerfile <immintrin.h>
3143///
3144/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3145///
3146/// \param __X
3147/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3148/// \returns A 256-bit vector of [4 x i64] containing the result.
3149static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3151 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3152}
3153
3154/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3155/// bytes of the 128-bit result.
3156///
3157/// \headerfile <immintrin.h>
3158///
3159/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3160///
3161/// \param __X
3162/// A 128-bit integer vector whose low byte will be broadcast.
3163/// \returns A 128-bit integer vector containing the result.
3164static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3166 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3167}
3168
3169/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3170/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3171///
3172/// \headerfile <immintrin.h>
3173///
3174/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3175///
3176/// \param __X
3177/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3178/// \returns A 128-bit vector of [8 x i16] containing the result.
3179static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3181 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3182}
3183
3184/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185/// to all elements of the result's vector of [4 x i32].
3186///
3187/// \headerfile <immintrin.h>
3188///
3189/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3190///
3191/// \param __X
3192/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193/// \returns A 128-bit vector of [4 x i32] containing the result.
3194static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3196 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3197}
3198
3199/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3200/// to both elements of the result's 128-bit vector of [2 x i64].
3201///
3202/// \headerfile <immintrin.h>
3203///
3204/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3205///
3206/// \param __X
3207/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3208/// \returns A 128-bit vector of [2 x i64] containing the result.
3209static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3211 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3212}
3213
3214/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3215/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3216/// elements of the 256-bit vector of [8 x i32] in \a __b.
3217///
3218/// \code{.operation}
3219/// FOR i := 0 TO 7
3220/// j := i*32
3221/// k := __b[j+2:j] * 32
3222/// result[j+31:j] := __a[k+31:k]
3223/// ENDFOR
3224/// \endcode
3225///
3226/// \headerfile <immintrin.h>
3227///
3228/// This intrinsic corresponds to the \c VPERMD instruction.
3229///
3230/// \param __a
3231/// A 256-bit vector of [8 x i32] containing the source values.
3232/// \param __b
3233/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3234/// \a __a.
3235/// \returns A 256-bit vector of [8 x i32] containing the result.
3236static __inline__ __m256i __DEFAULT_FN_ATTRS256
3238{
3239 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3240}
3241
3242/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3243/// the 256-bit vector of [4 x double] in \a V as specified by the
3244/// immediate value \a M.
3245///
3246/// \code{.operation}
3247/// FOR i := 0 TO 3
3248/// j := i*64
3249/// k := (M >> i*2)[1:0] * 64
3250/// result[j+63:j] := V[k+63:k]
3251/// ENDFOR
3252/// \endcode
3253///
3254/// \headerfile <immintrin.h>
3255///
3256/// \code
3257/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3258/// \endcode
3259///
3260/// This intrinsic corresponds to the \c VPERMPD instruction.
3261///
3262/// \param V
3263/// A 256-bit vector of [4 x double] containing the source values.
3264/// \param M
3265/// An immediate 8-bit value specifying which elements to copy from \a V.
3266/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3267/// \a M[3:2] specifies the index for element 1, and so forth.
3268/// \returns A 256-bit vector of [4 x double] containing the result.
/* Macro form: M is forwarded to the builtin as an int immediate. */
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256( \
      (__v4df)(__m256d)(V), \
      (int)(M)))
3271
3272/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3273/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3274/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3275///
3276/// \code{.operation}
3277/// FOR i := 0 TO 7
3278/// j := i*32
3279/// k := __b[j+2:j] * 32
3280/// result[j+31:j] := __a[k+31:k]
3281/// ENDFOR
3282/// \endcode
3283///
3284/// \headerfile <immintrin.h>
3285///
3286/// This intrinsic corresponds to the \c VPERMPS instruction.
3287///
3288/// \param __a
3289/// A 256-bit vector of [8 x float] containing the source values.
3290/// \param __b
3291/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3292/// \a __a.
3293/// \returns A 256-bit vector of [8 x float] containing the result.
3294static __inline__ __m256 __DEFAULT_FN_ATTRS256
3296{
3297 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3298}
3299
3300/// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3301/// of the 256-bit vector of [4 x i64] in \a V as specified by the
3302/// immediate value \a M.
3303///
3304/// \code{.operation}
3305/// FOR i := 0 TO 3
3306/// j := i*64
3307/// k := (M >> i*2)[1:0] * 64
3308/// result[j+63:j] := V[k+63:k]
3309/// ENDFOR
3310/// \endcode
3311///
3312/// \headerfile <immintrin.h>
3313///
3314/// \code
3315/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3316/// \endcode
3317///
3318/// This intrinsic corresponds to the \c VPERMQ instruction.
3319///
3320/// \param V
3321/// A 256-bit vector of [4 x i64] containing the source values.
3322/// \param M
3323/// An immediate 8-bit value specifying which elements to copy from \a V.
3324/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3325/// \a M[3:2] specifies the index for element 1, and so forth.
3326/// \returns A 256-bit vector of [4 x i64] containing the result.
/* Macro form: M is forwarded to the builtin as an int immediate. */
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256( \
      (__v4di)(__m256i)(V), \
      (int)(M)))
3329
3330/// Sets each half of the 256-bit result either to zero or to one of the
3331/// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3332/// as specified by the immediate value \a M.
3333///
3334/// \code{.operation}
3335/// FOR i := 0 TO 1
3336/// j := i*128
3337/// k := M >> (i*4)
3338/// IF k[3] == 0
3339/// CASE (k[1:0]) OF
3340/// 0: result[127+j:j] := V1[127:0]
3341/// 1: result[127+j:j] := V1[255:128]
3342/// 2: result[127+j:j] := V2[127:0]
3343/// 3: result[127+j:j] := V2[255:128]
3344/// ESAC
3345/// ELSE
3346/// result[127+j:j] := 0
3347/// FI
3348/// ENDFOR
3349/// \endcode
3350///
3351/// \headerfile <immintrin.h>
3352///
3353/// \code
3354/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3355/// \endcode
3356///
3357/// This intrinsic corresponds to the \c VPERM2I128 instruction.
3358///
3359/// \param V1
3360/// A 256-bit integer vector containing source values.
3361/// \param V2
3362/// A 256-bit integer vector containing source values.
3363/// \param M
3364/// An immediate value specifying how to form the result. Bits [3:0]
3365/// control the lower half of the result, bits [7:4] control the upper half.
3366/// Within each 4-bit control value, if bit 3 is 1, the result is zero,
3367/// otherwise bits [1:0] determine the source as follows. \n
3368/// 0: the lower half of \a V1 \n
3369/// 1: the upper half of \a V1 \n
3370/// 2: the lower half of \a V2 \n
3371/// 3: the upper half of \a V2
3372/// \returns A 256-bit integer vector containing the result.
/* Both sources are passed as whole 256-bit integer vectors; M is forwarded
   to the builtin as an int immediate. */
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      (__m256i)(V1), \
      (__m256i)(V2), \
      (int)(M)))
3375
3376/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3377/// of the immediate \a M is zero, extracts the lower half of \a V;
3378/// otherwise, extracts the upper half.
3379///
3380/// \headerfile <immintrin.h>
3381///
3382/// \code
3383/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3384/// \endcode
3385///
3386/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3387///
3388/// \param V
3389/// A 256-bit integer vector containing the source values.
3390/// \param M
3391/// An immediate value specifying which half of \a V to extract.
3392/// \returns A 128-bit integer vector containing the result.
/* The source is viewed as [4 x i64]; M is forwarded as an int immediate. */
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      (__v4di)(__m256i)(V), \
      (int)(M)))
3395
3396/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3397/// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3398/// is zero, overwrites the lower half of the result; otherwise,
3399/// overwrites the upper half.
3400///
3401/// \headerfile <immintrin.h>
3402///
3403/// \code
3404/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3405/// \endcode
3406///
3407/// This intrinsic corresponds to the \c VINSERTI128 instruction.
3408///
3409/// \param V1
3410/// A 256-bit integer vector containing a source value.
3411/// \param V2
3412/// A 128-bit integer vector containing a source value.
3413/// \param M
3414/// An immediate value specifying where to put \a V2 in the result.
3415/// \returns A 256-bit integer vector containing the result.
/* V1 is viewed as [4 x i64] and V2 as [2 x i64]; M is forwarded as an int
   immediate. */
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), \
      (__v2di)(__m128i)(V2), \
      (int)(M)))
3419
3420/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3421/// the most significant bit of the corresponding element in the mask
3422/// \a __M is set; otherwise, sets that element of the result to zero.
3423/// Returns the 256-bit [8 x i32] result.
3424///
3425/// \code{.operation}
3426/// FOR i := 0 TO 7
3427/// j := i*32
3428/// IF __M[j+31] == 1
3429/// result[j+31:j] := Load32(__X+(i*4))
3430/// ELSE
3431/// result[j+31:j] := 0
3432/// FI
3433/// ENDFOR
3434/// \endcode
3435///
3436/// \headerfile <immintrin.h>
3437///
3438/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3439///
3440/// \param __X
3441/// A pointer to the memory used for loading values.
3442/// \param __M
3443/// A 256-bit vector of [8 x i32] containing the mask bits.
3444/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3445/// elements.
3446static __inline__ __m256i __DEFAULT_FN_ATTRS256
3447_mm256_maskload_epi32(int const *__X, __m256i __M)
3448{
3449 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3450}
3451
3452/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3453/// the most significant bit of the corresponding element in the mask
3454/// \a __M is set; otherwise, sets that element of the result to zero.
3455/// Returns the 256-bit [4 x i64] result.
3456///
3457/// \code{.operation}
3458/// FOR i := 0 TO 3
3459/// j := i*64
3460/// IF __M[j+63] == 1
3461/// result[j+63:j] := Load64(__X+(i*8))
3462/// ELSE
3463/// result[j+63:j] := 0
3464/// FI
3465/// ENDFOR
3466/// \endcode
3467///
3468/// \headerfile <immintrin.h>
3469///
3470/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3471///
3472/// \param __X
3473/// A pointer to the memory used for loading values.
3474/// \param __M
3475/// A 256-bit vector of [4 x i64] containing the mask bits.
3476/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3477/// elements.
3478static __inline__ __m256i __DEFAULT_FN_ATTRS256
3479_mm256_maskload_epi64(long long const *__X, __m256i __M)
3480{
3481 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3482}
3483
3484/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3485/// the most significant bit of the corresponding element in the mask
3486/// \a __M is set; otherwise, sets that element of the result to zero.
3487/// Returns the 128-bit [4 x i32] result.
3488///
3489/// \code{.operation}
3490/// FOR i := 0 TO 3
3491/// j := i*32
3492/// IF __M[j+31] == 1
3493/// result[j+31:j] := Load32(__X+(i*4))
3494/// ELSE
3495/// result[j+31:j] := 0
3496/// FI
3497/// ENDFOR
3498/// \endcode
3499///
3500/// \headerfile <immintrin.h>
3501///
3502/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3503///
3504/// \param __X
3505/// A pointer to the memory used for loading values.
3506/// \param __M
3507/// A 128-bit vector of [4 x i32] containing the mask bits.
3508/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3509/// elements.
3510static __inline__ __m128i __DEFAULT_FN_ATTRS128
3511_mm_maskload_epi32(int const *__X, __m128i __M)
3512{
3513 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3514}
3515
3516/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3517/// the most significant bit of the corresponding element in the mask
3518/// \a __M is set; otherwise, sets that element of the result to zero.
3519/// Returns the 128-bit [2 x i64] result.
3520///
3521/// \code{.operation}
3522/// FOR i := 0 TO 1
3523/// j := i*64
3524/// IF __M[j+63] == 1
3525/// result[j+63:j] := Load64(__X+(i*8))
3526/// ELSE
3527/// result[j+63:j] := 0
3528/// FI
3529/// ENDFOR
3530/// \endcode
3531///
3532/// \headerfile <immintrin.h>
3533///
3534/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3535///
3536/// \param __X
3537/// A pointer to the memory used for loading values.
3538/// \param __M
3539/// A 128-bit vector of [2 x i64] containing the mask bits.
3540/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3541/// elements.
3542static __inline__ __m128i __DEFAULT_FN_ATTRS128
3543_mm_maskload_epi64(long long const *__X, __m128i __M)
3544{
3545 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3546}
3547
3548/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3549/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3550/// the corresponding element in the mask \a __M is set; otherwise, the
3551/// memory element is unchanged.
3552///
3553/// \code{.operation}
3554/// FOR i := 0 TO 7
3555/// j := i*32
3556/// IF __M[j+31] == 1
3557/// Store32(__X+(i*4), __Y[j+31:j])
3558/// FI
3559/// ENDFOR
3560/// \endcode
3561///
3562/// \headerfile <immintrin.h>
3563///
3564/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3565///
3566/// \param __X
3567/// A pointer to the memory used for storing values.
3568/// \param __M
3569/// A 256-bit vector of [8 x i32] containing the mask bits.
3570/// \param __Y
3571/// A 256-bit vector of [8 x i32] containing the values to store.
3572static __inline__ void __DEFAULT_FN_ATTRS256
3573_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3574{
3575 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3576}
3577
3578/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3579/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3580/// the corresponding element in the mask \a __M is set; otherwise, the
3581/// memory element is unchanged.
3582///
3583/// \code{.operation}
3584/// FOR i := 0 TO 3
3585/// j := i*64
3586/// IF __M[j+63] == 1
3587/// Store64(__X+(i*8), __Y[j+63:j])
3588/// FI
3589/// ENDFOR
3590/// \endcode
3591///
3592/// \headerfile <immintrin.h>
3593///
3594/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3595///
3596/// \param __X
3597/// A pointer to the memory used for storing values.
3598/// \param __M
3599/// A 256-bit vector of [4 x i64] containing the mask bits.
3600/// \param __Y
3601/// A 256-bit vector of [4 x i64] containing the values to store.
3602static __inline__ void __DEFAULT_FN_ATTRS256
3603_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3604{
3605 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3606}
3607
3608/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3609/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3610/// the corresponding element in the mask \a __M is set; otherwise, the
3611/// memory element is unchanged.
3612///
3613/// \code{.operation}
3614/// FOR i := 0 TO 3
3615/// j := i*32
3616/// IF __M[j+31] == 1
3617/// Store32(__X+(i*4), __Y[j+31:j])
3618/// FI
3619/// ENDFOR
3620/// \endcode
3621///
3622/// \headerfile <immintrin.h>
3623///
3624/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3625///
3626/// \param __X
3627/// A pointer to the memory used for storing values.
3628/// \param __M
3629/// A 128-bit vector of [4 x i32] containing the mask bits.
3630/// \param __Y
3631/// A 128-bit vector of [4 x i32] containing the values to store.
3632static __inline__ void __DEFAULT_FN_ATTRS128
3633_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3634{
3635 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3636}
3637
3638/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3639/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3640/// the corresponding element in the mask \a __M is set; otherwise, the
3641/// memory element is unchanged.
3642///
3643/// \code{.operation}
3644/// FOR i := 0 TO 1
3645/// j := i*64
3646/// IF __M[j+63] == 1
3647/// Store64(__X+(i*8), __Y[j+63:j])
3648/// FI
3649/// ENDFOR
3650/// \endcode
3651///
3652/// \headerfile <immintrin.h>
3653///
3654/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3655///
3656/// \param __X
3657/// A pointer to the memory used for storing values.
3658/// \param __M
3659/// A 128-bit vector of [2 x i64] containing the mask bits.
3660/// \param __Y
3661/// A 128-bit vector of [2 x i64] containing the values to store.
3662static __inline__ void __DEFAULT_FN_ATTRS128
3663_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3664{
3665 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3666}
3667
3668/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3669/// left by the number of bits given in the corresponding element of the
3670/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3671/// returns the result. If the shift count for any element is greater than
3672/// 31, the result for that element is zero.
3673///
3674/// \headerfile <immintrin.h>
3675///
3676/// This intrinsic corresponds to the \c VPSLLVD instruction.
3677///
3678/// \param __X
3679/// A 256-bit vector of [8 x i32] to be shifted.
3680/// \param __Y
3681/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3682/// bits).
3683/// \returns A 256-bit vector of [8 x i32] containing the result.
3684static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3685_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3686{
3687 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3688}
3689
3690/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3691/// left by the number of bits given in the corresponding element of the
3692/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3693/// returns the result. If the shift count for any element is greater than
3694/// 31, the result for that element is zero.
3695///
3696/// \headerfile <immintrin.h>
3697///
3698/// This intrinsic corresponds to the \c VPSLLVD instruction.
3699///
3700/// \param __X
3701/// A 128-bit vector of [4 x i32] to be shifted.
3702/// \param __Y
3703/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3704/// bits).
3705/// \returns A 128-bit vector of [4 x i32] containing the result.
3706static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3707_mm_sllv_epi32(__m128i __X, __m128i __Y)
3708{
3709 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3710}
3711
3712/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3713/// left by the number of bits given in the corresponding element of the
3714/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3715/// returns the result. If the shift count for any element is greater than
3716/// 63, the result for that element is zero.
3717///
3718/// \headerfile <immintrin.h>
3719///
3720/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3721///
3722/// \param __X
3723/// A 256-bit vector of [4 x i64] to be shifted.
3724/// \param __Y
3725/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3726/// bits).
3727/// \returns A 256-bit vector of [4 x i64] containing the result.
3728static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3729_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3730{
3731 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3732}
3733
3734/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3735/// left by the number of bits given in the corresponding element of the
3736/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3737/// returns the result. If the shift count for any element is greater than
3738/// 63, the result for that element is zero.
3739///
3740/// \headerfile <immintrin.h>
3741///
3742/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3743///
3744/// \param __X
3745/// A 128-bit vector of [2 x i64] to be shifted.
3746/// \param __Y
3747/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3748/// bits).
3749/// \returns A 128-bit vector of [2 x i64] containing the result.
3750static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3751_mm_sllv_epi64(__m128i __X, __m128i __Y)
3752{
3753 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3754}
3755
3756/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3757/// right by the number of bits given in the corresponding element of the
3758/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3759/// returns the result. If the shift count for any element is greater than
3760/// 31, the result for that element is 0 or -1 according to the sign bit
3761/// for that element.
3762///
3763/// \headerfile <immintrin.h>
3764///
3765/// This intrinsic corresponds to the \c VPSRAVD instruction.
3766///
3767/// \param __X
3768/// A 256-bit vector of [8 x i32] to be shifted.
3769/// \param __Y
3770/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3771/// bits).
3772/// \returns A 256-bit vector of [8 x i32] containing the result.
3773static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3774_mm256_srav_epi32(__m256i __X, __m256i __Y)
3775{
3776 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3777}
3778
3779/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3780/// right by the number of bits given in the corresponding element of the
3781/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3782/// returns the result. If the shift count for any element is greater than
3783/// 31, the result for that element is 0 or -1 according to the sign bit
3784/// for that element.
3785///
3786/// \headerfile <immintrin.h>
3787///
3788/// This intrinsic corresponds to the \c VPSRAVD instruction.
3789///
3790/// \param __X
3791/// A 128-bit vector of [4 x i32] to be shifted.
3792/// \param __Y
3793/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3794/// bits).
3795/// \returns A 128-bit vector of [4 x i32] containing the result.
3796static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3797_mm_srav_epi32(__m128i __X, __m128i __Y)
3798{
3799 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3800}
3801
3802/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3803/// right by the number of bits given in the corresponding element of the
3804/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3805/// returns the result. If the shift count for any element is greater than
3806/// 31, the result for that element is zero.
3807///
3808/// \headerfile <immintrin.h>
3809///
3810/// This intrinsic corresponds to the \c VPSRLVD instruction.
3811///
3812/// \param __X
3813/// A 256-bit vector of [8 x i32] to be shifted.
3814/// \param __Y
3815/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3816/// bits).
3817/// \returns A 256-bit vector of [8 x i32] containing the result.
3818static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3819_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3820{
3821 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3822}
3823
3824/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3825/// right by the number of bits given in the corresponding element of the
3826/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3827/// returns the result. If the shift count for any element is greater than
3828/// 31, the result for that element is zero.
3829///
3830/// \headerfile <immintrin.h>
3831///
3832/// This intrinsic corresponds to the \c VPSRLVD instruction.
3833///
3834/// \param __X
3835/// A 128-bit vector of [4 x i32] to be shifted.
3836/// \param __Y
3837/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3838/// bits).
3839/// \returns A 128-bit vector of [4 x i32] containing the result.
3840static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3841_mm_srlv_epi32(__m128i __X, __m128i __Y)
3842{
3843 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3844}
3845
3846/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3847/// right by the number of bits given in the corresponding element of the
3848/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3849/// returns the result. If the shift count for any element is greater than
3850/// 63, the result for that element is zero.
3851///
3852/// \headerfile <immintrin.h>
3853///
3854/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3855///
3856/// \param __X
3857/// A 256-bit vector of [4 x i64] to be shifted.
3858/// \param __Y
3859/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3860/// bits).
3861/// \returns A 256-bit vector of [4 x i64] containing the result.
3862static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3863_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3864{
3865 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3866}
3867
3868/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3869/// right by the number of bits given in the corresponding element of the
3870/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3871/// returns the result. If the shift count for any element is greater than
3872/// 63, the result for that element is zero.
3873///
3874/// \headerfile <immintrin.h>
3875///
3876/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3877///
3878/// \param __X
3879/// A 128-bit vector of [2 x i64] to be shifted.
3880/// \param __Y
3881/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3882/// bits).
3883/// \returns A 128-bit vector of [2 x i64] containing the result.
3884static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3885_mm_srlv_epi64(__m128i __X, __m128i __Y)
3886{
3887 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3888}
3889
3890/// Conditionally gathers two 64-bit floating-point values, either from the
3891/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3892/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3893/// of [2 x double] in \a mask determines the source for each element.
3894///
3895/// \code{.operation}
3896/// FOR element := 0 to 1
3897/// j := element*64
3898/// k := element*32
3899/// IF mask[j+63] == 0
3900/// result[j+63:j] := a[j+63:j]
3901/// ELSE
3902/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3903/// FI
3904/// ENDFOR
3905/// \endcode
3906///
3907/// \headerfile <immintrin.h>
3908///
3909/// \code
3910/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
3911/// __m128d mask, const int s);
3912/// \endcode
3913///
3914/// This intrinsic corresponds to the \c VGATHERDPD instruction.
3915///
3916/// \param a
3917/// A 128-bit vector of [2 x double] used as the source when a mask bit is
3918/// zero.
3919/// \param m
3920/// A pointer to the memory used for loading values.
3921/// \param i
3922/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3923/// the first two elements are used.
3924/// \param mask
3925/// A 128-bit vector of [2 x double] containing the mask. The most
3926/// significant bit of each element in the mask vector represents the mask
3927/// bits. If a mask bit is zero, the corresponding value from vector \a a
3928/// is gathered; otherwise the value is loaded from memory.
3929/// \param s
3930/// A literal constant scale factor for the indexes in \a i. Must be
3931/// 1, 2, 4, or 8.
3932/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The pass-through operand a is a [2 x double] vector, so it is viewed as
   __m128d (not __m128i) before the element-type cast, matching the documented
   prototype. The index vector i is the only integer-vector operand. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
3938
3939/// Conditionally gathers four 64-bit floating-point values, either from the
3940/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
3941/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
3942/// of [4 x double] in \a mask determines the source for each element.
3943///
3944/// \code{.operation}
3945/// FOR element := 0 to 3
3946/// j := element*64
3947/// k := element*32
3948/// IF mask[j+63] == 0
3949/// result[j+63:j] := a[j+63:j]
3950/// ELSE
3951/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3952/// FI
3953/// ENDFOR
3954/// \endcode
3955///
3956/// \headerfile <immintrin.h>
3957///
3958/// \code
3959/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
3960/// __m256d mask, const int s);
3961/// \endcode
3962///
3963/// This intrinsic corresponds to the \c VGATHERDPD instruction.
3964///
3965/// \param a
3966/// A 256-bit vector of [4 x double] used as the source when a mask bit is
3967/// zero.
3968/// \param m
3969/// A pointer to the memory used for loading values.
3970/// \param i
3971/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3972/// \param mask
3973/// A 256-bit vector of [4 x double] containing the mask. The most
3974/// significant bit of each element in the mask vector represents the mask
3975/// bits. If a mask bit is zero, the corresponding value from vector \a a
3976/// is gathered; otherwise the value is loaded from memory.
3977/// \param s
3978/// A literal constant scale factor for the indexes in \a i. Must be
3979/// 1, 2, 4, or 8.
3980/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Operands are recast to the element-typed vectors the builtin takes; the
   scale s is forwarded unchanged. */
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
3986
3987/// Conditionally gathers two 64-bit floating-point values, either from the
3988/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3989/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3990/// of [2 x double] in \a mask determines the source for each element.
3991///
3992/// \code{.operation}
3993/// FOR element := 0 to 1
3994/// j := element*64
3995/// k := element*64
3996/// IF mask[j+63] == 0
3997/// result[j+63:j] := a[j+63:j]
3998/// ELSE
3999/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4000/// FI
4001/// ENDFOR
4002/// \endcode
4003///
4004/// \headerfile <immintrin.h>
4005///
4006/// \code
4007/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4008/// __m128d mask, const int s);
4009/// \endcode
4010///
4011/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4012///
4013/// \param a
4014/// A 128-bit vector of [2 x double] used as the source when a mask bit is
4015/// zero.
4016/// \param m
4017/// A pointer to the memory used for loading values.
4018/// \param i
4019/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4020/// \param mask
4021/// A 128-bit vector of [2 x double] containing the mask. The most
4022/// significant bit of each element in the mask vector represents the mask
4023/// bits. If a mask bit is zero, the corresponding value from vector \a a
4024/// is gathered; otherwise the value is loaded from memory.
4025/// \param s
4026/// A literal constant scale factor for the indexes in \a i. Must be
4027/// 1, 2, 4, or 8.
4028/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Operands are recast to the element-typed vectors the builtin takes; the
   scale s is forwarded unchanged. */
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), \
      (double const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), \
      (s)))
4034
4035/// Conditionally gathers four 64-bit floating-point values, either from the
4036/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4037/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4038/// of [4 x double] in \a mask determines the source for each element.
4039///
4040/// \code{.operation}
4041/// FOR element := 0 to 3
4042/// j := element*64
4043/// k := element*64
4044/// IF mask[j+63] == 0
4045/// result[j+63:j] := a[j+63:j]
4046/// ELSE
4047/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4048/// FI
4049/// ENDFOR
4050/// \endcode
4051///
4052/// \headerfile <immintrin.h>
4053///
4054/// \code
4055/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4056/// __m256d mask, const int s);
4057/// \endcode
4058///
4059/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4060///
4061/// \param a
4062/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4063/// zero.
4064/// \param m
4065/// A pointer to the memory used for loading values.
4066/// \param i
4067/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4068/// \param mask
4069/// A 256-bit vector of [4 x double] containing the mask. The most
4070/// significant bit of each element in the mask vector represents the mask
4071/// bits. If a mask bit is zero, the corresponding value from vector \a a
4072/// is gathered; otherwise the value is loaded from memory.
4073/// \param s
4074/// A literal constant scale factor for the indexes in \a i. Must be
4075/// 1, 2, 4, or 8.
4076/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Operands are recast to the element-typed vectors the builtin takes; the
   scale s is forwarded unchanged. */
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
4082
/// Produces a 128-bit vector of [4 x float] in which every element is either
///    copied from the 128-bit vector of [4 x float] in \a a or loaded as a
///    32-bit floating-point value from memory \a m. Load addresses come from
///    the scaled indexes in the 128-bit vector of [4 x i32] in \a i, and the
///    128-bit vector of [4 x float] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] that supplies the result element
///    wherever the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4130
/// Produces a 256-bit vector of [8 x float] in which every element is either
///    copied from the 256-bit vector of [8 x float] in \a a or loaded as a
///    32-bit floating-point value from memory \a m. Load addresses come from
///    the scaled indexes in the 256-bit vector of [8 x i32] in \a i, and the
///    256-bit vector of [8 x float] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] that supplies the result element
///    wherever the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] holding the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i), \
      (__v8sf)(__m256)(mask), (s)))
4178
/// Produces a 128-bit vector of [4 x float] whose lower two elements are each
///    either copied from the 128-bit vector of [4 x float] in \a a or loaded
///    as a 32-bit floating-point value from memory \a m. Load addresses come
///    from the scaled indexes in the 128-bit vector of [2 x i64] in \a i, and
///    the 128-bit vector of [4 x float] in \a mask selects the source of each
///    of the lower two elements. The upper two elements of the result are
///    zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] that supplies the result element
///    wherever the corresponding mask bit is zero. Only the first two
///    elements are used.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory. Only
///    the first two elements are used.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4229
/// Produces a 128-bit vector of [4 x float] in which every element is either
///    copied from the 128-bit vector of [4 x float] in \a a or loaded as a
///    32-bit floating-point value from memory \a m. Load addresses come from
///    the scaled indexes in the 256-bit vector of [4 x i64] in \a i, and the
///    128-bit vector of [4 x float] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] that supplies the result element
///    wherever the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4277
/// Produces a 128-bit vector of [4 x i32] in which every element is either
///    copied from the 128-bit vector of [4 x i32] in \a a or loaded as a
///    32-bit integer value from memory \a m. Load addresses come from the
///    scaled indexes in the 128-bit vector of [4 x i32] in \a i, and the
///    128-bit vector of [4 x i32] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4si)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4325
/// Produces a 256-bit vector of [8 x i32] in which every element is either
///    copied from the 256-bit vector of [8 x i32] in \a a or loaded as a
///    32-bit integer value from memory \a m. Load addresses come from the
///    scaled indexes in the 256-bit vector of [8 x i32] in \a i, and the
///    256-bit vector of [8 x i32] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] holding the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i), \
      (__v8si)(__m256i)(mask), (s)))
4373
/// Conditionally gathers two 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory. Only the first two elements
///    are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v2di)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4424
/// Produces a 128-bit vector of [4 x i32] in which every element is either
///    copied from the 128-bit vector of [4 x i32] in \a a or loaded as a
///    32-bit integer value from memory \a m. Load addresses come from the
///    scaled indexes in the 256-bit vector of [4 x i64] in \a i, and the
///    128-bit vector of [4 x i32] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4472
/// Produces a 128-bit vector of [2 x i64] in which every element is either
///    copied from the 128-bit vector of [2 x i64] in \a a or loaded as a
///    64-bit integer value from memory \a m. Load addresses come from the
///    scaled indexes in the 128-bit vector of [4 x i32] in \a i, and the
///    128-bit vector of [2 x i64] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4521
/// Produces a 256-bit vector of [4 x i64] in which every element is either
///    copied from the 256-bit vector of [4 x i64] in \a a or loaded as a
///    64-bit integer value from memory \a m. Load addresses come from the
///    scaled indexes in the 128-bit vector of [4 x i32] in \a i, and the
///    256-bit vector of [4 x i64] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4569
/// Produces a 128-bit vector of [2 x i64] in which every element is either
///    copied from the 128-bit vector of [2 x i64] in \a a or loaded as a
///    64-bit integer value from memory \a m. Load addresses come from the
///    scaled indexes in the 128-bit vector of [2 x i64] in \a i, and the
///    128-bit vector of [2 x i64] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v2di)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4617
/// Produces a 256-bit vector of [4 x i64] in which every element is either
///    copied from the 256-bit vector of [4 x i64] in \a a or loaded as a
///    64-bit integer value from memory \a m. Load addresses come from the
///    scaled indexes in the 256-bit vector of [4 x i64] in \a i, and the
///    256-bit vector of [4 x i64] in \a mask selects the source of each
///    element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] that supplies the result element wherever
///    the corresponding mask bit is zero.
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] holding the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4di)(__m256i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4665
/// Loads two 64-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [4 x i32] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] holding the gathered values.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v4si)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
4701
/// Loads four 64-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [4 x i32] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] holding the gathered values.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4737
/// Loads two 64-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [2 x i64] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] holding the gathered values.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v2di)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
4772
/// Loads four 64-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 256-bit vector of [4 x i64] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] holding the gathered values.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
4808
/// Loads four 32-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [4 x i32] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the gathered values.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4843
/// Loads eight 32-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 256-bit vector of [8 x i32] in
///    \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 256-bit vector of [8 x i32] holding signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] holding the gathered values.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)_mm256_undefined_ps(), (float const *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))
4879
/// Loads two 32-bit floating-point values from memory \a m, addressing each
///    one with a scaled index taken from the 128-bit vector of [2 x i64] in
///    \a i. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    Pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding signed indexes into \a m.
/// \param s
///    Scale factor applied to the indexes in \a i. Must be a literal constant
///    equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the gathered values.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4916
/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4951
4952/// Gathers four 32-bit integer values from memory \a m using scaled
4953/// indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// The gather is unconditional: the macro passes an all-ones mask
/// (\c _mm_set1_epi32(-1)) and an undefined source vector to the builtin.
4954///
4955/// \code{.operation}
4956/// FOR element := 0 to 3
4957/// j := element*32
4958/// k := element*32
4959/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4960/// ENDFOR
4961/// \endcode
4962///
4963/// \headerfile <immintrin.h>
4964///
4965/// \code
4966/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
4967/// \endcode
4968///
4969/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4970///
4971/// \param m
4972/// A pointer to the memory used for loading values.
4973/// \param i
4974/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4975/// \param s
4976/// A literal constant scale factor for the indexes in \a i. Must be
4977/// 1, 2, 4, or 8.
4978/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4979#define _mm_i32gather_epi32(m, i, s) \
4980 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
4981 (int const *)(m), (__v4si)(__m128i)(i), \
4982 (__v4si)_mm_set1_epi32(-1), (s)))
4983
4984/// Gathers eight 32-bit integer values from memory \a m using scaled
4985/// indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// The gather is unconditional: the macro passes an all-ones mask
/// (\c _mm256_set1_epi32(-1)) and an undefined source vector to the builtin.
4986///
4987/// \code{.operation}
4988/// FOR element := 0 to 7
4989/// j := element*32
4990/// k := element*32
4991/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4992/// ENDFOR
4993/// \endcode
4994///
4995/// \headerfile <immintrin.h>
4996///
4997/// \code
4998/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
4999/// \endcode
5000///
5001/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5002///
5003/// \param m
5004/// A pointer to the memory used for loading values.
5005/// \param i
5006/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5007/// \param s
5008/// A literal constant scale factor for the indexes in \a i. Must be
5009/// 1, 2, 4, or 8.
5010/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5011#define _mm256_i32gather_epi32(m, i, s) \
5012 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5013 (int const *)(m), (__v8si)(__m256i)(i), \
5014 (__v8si)_mm256_set1_epi32(-1), (s)))
5015
5016/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5017/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5018/// of the result are zeroed.
///
/// The gather is unconditional: the macro passes an all-ones mask
/// (\c _mm_set1_epi32(-1)) and an undefined source vector to the builtin.
5019///
5020/// \code{.operation}
5021/// FOR element := 0 to 1
5022/// j := element*32
5023/// k := element*64
5024/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5025/// ENDFOR
5026/// result[127:64] := 0
5027/// \endcode
5028///
5029/// \headerfile <immintrin.h>
5030///
5031/// \code
5032/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5033/// \endcode
5034///
5035/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5036///
5037/// \param m
5038/// A pointer to the memory used for loading values.
5039/// \param i
5040/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5041/// \param s
5042/// A literal constant scale factor for the indexes in \a i. Must be
5043/// 1, 2, 4, or 8.
5044/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5045#define _mm_i64gather_epi32(m, i, s) \
5046 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5047 (int const *)(m), (__v2di)(__m128i)(i), \
5048 (__v4si)_mm_set1_epi32(-1), (s)))
5049
5050/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5051/// from the 256-bit vector of [4 x i64] in \a i.
///
/// The gather is unconditional: the macro passes an all-ones mask
/// (\c _mm_set1_epi32(-1)) and an undefined source vector to the builtin.
5052///
5053/// \code{.operation}
5054/// FOR element := 0 to 3
5055/// j := element*32
5056/// k := element*64
5057/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5058/// ENDFOR
5059/// \endcode
5060///
5061/// \headerfile <immintrin.h>
5062///
5063/// \code
5064/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5065/// \endcode
5066///
5067/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5068///
5069/// \param m
5070/// A pointer to the memory used for loading values.
5071/// \param i
5072/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5073/// \param s
5074/// A literal constant scale factor for the indexes in \a i. Must be
5075/// 1, 2, 4, or 8.
5076/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5077#define _mm256_i64gather_epi32(m, i, s) \
5078 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5079 (int const *)(m), (__v4di)(__m256i)(i), \
5080 (__v4si)_mm_set1_epi32(-1), (s)))
5081
5082/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5083/// from the 128-bit vector of [4 x i32] in \a i.
///
/// The gather is unconditional: the macro passes an all-ones mask
/// (\c _mm_set1_epi64x(-1)) and an undefined source vector to the builtin.
5084///
5085/// \code{.operation}
5086/// FOR element := 0 to 1
5087/// j := element*64
5088/// k := element*32
5089/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5090/// ENDFOR
5091/// \endcode
5092///
5093/// \headerfile <immintrin.h>
5094///
5095/// \code
5096/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5097/// \endcode
5098///
5099/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5100///
5101/// \param m
5102/// A pointer to the memory used for loading values.
5103/// \param i
5104/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5105/// the first two elements are used.
5106/// \param s
5107/// A literal constant scale factor for the indexes in \a i. Must be
5108/// 1, 2, 4, or 8.
5109/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5110#define _mm_i32gather_epi64(m, i, s) \
5111 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5112 (long long const *)(m), \
5113 (__v4si)(__m128i)(i), \
5114 (__v2di)_mm_set1_epi64x(-1), (s)))
5115
5116/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5117/// from the 128-bit vector of [4 x i32] in \a i.
///
/// The gather is unconditional: the macro passes an all-ones mask
/// (\c _mm256_set1_epi64x(-1)) and an undefined source vector to the builtin.
5118///
5119/// \code{.operation}
5120/// FOR element := 0 to 3
5121/// j := element*64
5122/// k := element*32
5123/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5124/// ENDFOR
5125/// \endcode
5126///
5127/// \headerfile <immintrin.h>
5128///
5129/// \code
5130/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5131/// \endcode
5132///
5133/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5134///
5135/// \param m
5136/// A pointer to the memory used for loading values.
5137/// \param i
5138/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5139/// \param s
5140/// A literal constant scale factor for the indexes in \a i. Must be
5141/// 1, 2, 4, or 8.
5142/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5143#define _mm256_i32gather_epi64(m, i, s) \
5144 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5145 (long long const *)(m), \
5146 (__v4si)(__m128i)(i), \
5147 (__v4di)_mm256_set1_epi64x(-1), (s)))
5148
5149/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5150/// from the 128-bit vector of [2 x i64] in \a i.
///
/// The gather is unconditional: the macro passes an all-ones mask
/// (\c _mm_set1_epi64x(-1)) and an undefined source vector to the builtin.
5151///
5152/// \code{.operation}
5153/// FOR element := 0 to 1
5154/// j := element*64
5155/// k := element*64
5156/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5157/// ENDFOR
5158/// \endcode
5159///
5160/// \headerfile <immintrin.h>
5161///
5162/// \code
5163/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5164/// \endcode
5165///
5166/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5167///
5168/// \param m
5169/// A pointer to the memory used for loading values.
5170/// \param i
5171/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5172/// \param s
5173/// A literal constant scale factor for the indexes in \a i. Must be
5174/// 1, 2, 4, or 8.
5175/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5176#define _mm_i64gather_epi64(m, i, s) \
5177 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5178 (long long const *)(m), \
5179 (__v2di)(__m128i)(i), \
5180 (__v2di)_mm_set1_epi64x(-1), (s)))
5181
5182/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5183/// from the 256-bit vector of [4 x i64] in \a i.
///
/// The gather is unconditional: the macro passes an all-ones mask
/// (\c _mm256_set1_epi64x(-1)) and an undefined source vector to the builtin.
5184///
5185/// \code{.operation}
5186/// FOR element := 0 to 3
5187/// j := element*64
5188/// k := element*64
5189/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5190/// ENDFOR
5191/// \endcode
5192///
5193/// \headerfile <immintrin.h>
5194///
5195/// \code
5196/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5197/// \endcode
5198///
5199/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5200///
5201/// \param m
5202/// A pointer to the memory used for loading values.
5203/// \param i
5204/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5205/// \param s
5206/// A literal constant scale factor for the indexes in \a i. Must be
5207/// 1, 2, 4, or 8.
5208/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5209#define _mm256_i64gather_epi64(m, i, s) \
5210 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5211 (long long const *)(m), \
5212 (__v4di)(__m256i)(i), \
5213 (__v4di)_mm256_set1_epi64x(-1), (s)))
5214
/* Remove the file-local attribute helper macros defined at the top of this
 * header so they do not leak into code that includes it. */
5215#undef __DEFAULT_FN_ATTRS256
5216#undef __DEFAULT_FN_ATTRS128
5217#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
5218#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5219
5220#endif /* __AVX2INTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
Definition altivec.h:578
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi32_epi64(__m128i __V)
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and re...
Definition avx2intrin.h:842
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(const void *__V)
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vec...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than an...
Definition avx2intrin.h:726
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epu32(__m256i __a, __m256i __b)
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srav_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_andnot_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit int...
Definition avx2intrin.h:470
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation...
Definition avx2intrin.h:390
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi32(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation,...
Definition avx2intrin.h:201
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M)
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to mem...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epu16(__m256i __a, __m256i __b)
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upp...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256...
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastsd_pd(__m128d __a)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M)
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi16(__m128i __V)
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsi...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi32(__m256i __a)
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a a...
Definition avx2intrin.h:139
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed by...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and r...
Definition avx2intrin.h:945
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srav_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi64(__m128i __V)
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and r...
Definition avx2intrin.h:674
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed sa...
Definition avx2intrin.h:372
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits spec...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memor...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu32_epi64(__m128i __V)
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu8_epi32(__m128i __V)
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x f...
static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8(__m256i __a)
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vecto...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the ...
Definition avx2intrin.h:555
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and r...
Definition avx2intrin.h:700
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M)
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadds_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using ...
Definition avx2intrin.h:909
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and re...
Definition avx2intrin.h:977
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a, __m256i __b)
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greate...
Definition avx2intrin.h:754
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 b...
Definition avx2intrin.h:283
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi32(__m128i __V)
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result'...
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mul_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi16(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation,...
Definition avx2intrin.h:169
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater...
Definition avx2intrin.h:780
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi32(__m256i __a, __m256i __b)
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epu8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned satur...
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using sign...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns t...
Definition avx2intrin.h:622
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi8(__m256i __a)
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each va...
Definition avx2intrin.h:107
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi64(__m128i __V)
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a, __m256i __b)
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit interme...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi64(__m128i __V)
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation,...
Definition avx2intrin.h:264
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi64(__m256i __a, __m256i __b)
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the ...
Definition avx2intrin.h:337
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi64(__m128i __V)
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_and_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
Definition avx2intrin.h:452
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsi128_si256(__m128i __X)
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srai_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi32(__m256i __a, __m256i __b)
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the ...
Definition avx2intrin.h:319
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m256i __a)
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a ...
Definition avx2intrin.h:123
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_add_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the...
Definition avx2intrin.h:301
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_or_si256(__m256i __a, __m256i __b)
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi16(__m256i __a, __m256i __b)
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation,...
Definition avx2intrin.h:232
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastsd_pd(__m128d __X)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepu16_epi32(__m128i __V)
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu16(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16...
Definition avx2intrin.h:521
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_xor_si256(__m256i __a, __m256i __b)
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_min_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epu16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned ...
Definition avx2intrin.h:407
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M)
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the c...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in _...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sub_epi64(__m256i __a, __m256i __b)
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and ret...
Definition avx2intrin.h:874
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_max_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result'...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits give...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_subs_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturat...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater...
Definition avx2intrin.h:806
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b)
Computes four sum-of-absolute-differences (SAD) operations on sets of eight unsigned 8-bit integers fr...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_adds_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation,...
Definition avx2intrin.h:355
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and ...
Definition avx2intrin.h:648
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_avg_epu8(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a a...
Definition avx2intrin.h:496
static __inline__ void int __a
Definition emmintrin.h:4077
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19