clang 20.0.0git
avx2intrin.h
Go to the documentation of this file.
1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX2INTRIN_H
15#define __AVX2INTRIN_H
16
/* Attribute sets applied to the inline functions defined in this header.
 * Both request always-inline, no debug info and the "avx2,no-evex512" target;
 * they differ only in the minimum vector width they declare (256 vs. 128). */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx2,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx2,no-evex512"), \
                 __min_vector_width__(128)))
24
25/* SSE4 Multiple Packed Sums of Absolute Difference. */
26/// Computes sixteen sum of absolute difference (SAD) operations on sets of
27/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
28/// \a Y.
29///
30/// Eight SAD results are computed using the lower half of the input
31/// vectors, and another eight using the upper half. These 16-bit values
32/// are returned in the lower and upper halves of the 256-bit result,
33/// respectively.
34///
35/// A single SAD operation selects four bytes from \a X and four bytes from
36/// \a Y as input. It computes the differences between each \a X byte and
37/// the corresponding \a Y byte, takes the absolute value of each
38/// difference, and sums these four values to form one 16-bit result. The
39/// intrinsic computes 16 of these results with different sets of input
40/// bytes.
41///
42/// For each set of eight results, the SAD operations use the same four
43/// bytes from \a Y; the starting bit position for these four bytes is
44/// specified by \a M[1:0] (lower half) or \a M[4:3] (upper half) times 32.
45/// The eight operations use successive sets of four bytes from \a X; the
46/// first set starts at \a M[2] (lower half) or \a M[5] (upper half) times
47/// 32. These bit positions are relative to the 128-bit lane of each set.
48///
49/// \code{.operation}
50/// r := 0
51/// FOR i := 0 TO 1
52/// j := i*3
53/// Ybase := M[j+1:j]*32 + i*128
54/// Xbase := M[j+2]*32 + i*128
55/// FOR k := 0 TO 7
56/// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
57/// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
58/// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
59/// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
60/// result[r+15:r] := temp0 + temp1 + temp2 + temp3
61/// Xbase := Xbase + 8
62/// r := r + 16
63/// ENDFOR
64/// ENDFOR
65/// \endcode
66///
67/// \headerfile <immintrin.h>
68///
69/// \code
70/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
71/// \endcode
72///
73/// This intrinsic corresponds to the \c VMPSADBW instruction.
74///
75/// \param X
76/// A 256-bit integer vector containing one of the inputs.
77/// \param Y
78/// A 256-bit integer vector containing one of the inputs.
79/// \param M
80/// An unsigned immediate value specifying the starting positions of the
81/// bytes to operate on.
82/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Casts X and Y through __m256i to __v32qi and M to int, forwards all three
 * to __builtin_ia32_mpsadbw256, and casts the builtin's value to __m256i. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256( \
      (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(M)))
86
87/// Computes the absolute value of each signed byte in the 256-bit integer
88/// vector \a __a and returns each value in the corresponding byte of
89/// the result.
90///
91/// \headerfile <immintrin.h>
92///
93/// This intrinsic corresponds to the \c VPABSB instruction.
94///
95/// \param __a
96/// A 256-bit integer vector.
97/// \returns A 256-bit integer vector containing the result.
98static __inline__ __m256i __DEFAULT_FN_ATTRS256
100{
101 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
102}
103
104/// Computes the absolute value of each signed 16-bit element in the 256-bit
105/// vector of [16 x i16] in \a __a and returns each value in the
106/// corresponding element of the result.
107///
108/// \headerfile <immintrin.h>
109///
110/// This intrinsic corresponds to the \c VPABSW instruction.
111///
112/// \param __a
113/// A 256-bit vector of [16 x i16].
114/// \returns A 256-bit vector of [16 x i16] containing the result.
115static __inline__ __m256i __DEFAULT_FN_ATTRS256
117{
118 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
119}
120
121/// Computes the absolute value of each signed 32-bit element in the 256-bit
122/// vector of [8 x i32] in \a __a and returns each value in the
123/// corresponding element of the result.
124///
125/// \headerfile <immintrin.h>
126///
127/// This intrinsic corresponds to the \c VPABSD instruction.
128///
129/// \param __a
130/// A 256-bit vector of [8 x i32].
131/// \returns A 256-bit vector of [8 x i32] containing the result.
132static __inline__ __m256i __DEFAULT_FN_ATTRS256
134{
135 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
136}
137
138/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
139/// integers using signed saturation, and returns the 256-bit result.
140///
141/// \code{.operation}
142/// FOR i := 0 TO 7
143/// j := i*16
144/// k := i*8
145/// result[7+k:k] := SATURATE8(__a[15+j:j])
146/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
147/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
148/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
149/// ENDFOR
150/// \endcode
151///
152/// \headerfile <immintrin.h>
153///
154/// This intrinsic corresponds to the \c VPACKSSWB instruction.
155///
156/// \param __a
157/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
158/// result[191:128].
159/// \param __b
160/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
161/// result[255:192].
162/// \returns A 256-bit integer vector containing the result.
163static __inline__ __m256i __DEFAULT_FN_ATTRS256
164_mm256_packs_epi16(__m256i __a, __m256i __b)
165{
166 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
167}
168
169/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
170/// integers using signed saturation, and returns the resulting 256-bit
171/// vector of [16 x i16].
172///
173/// \code{.operation}
174/// FOR i := 0 TO 3
175/// j := i*32
176/// k := i*16
177/// result[15+k:k] := SATURATE16(__a[31+j:j])
178/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
179/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
180/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
181/// ENDFOR
182/// \endcode
183///
184/// \headerfile <immintrin.h>
185///
186/// This intrinsic corresponds to the \c VPACKSSDW instruction.
187///
188/// \param __a
189/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
190/// result[191:128].
191/// \param __b
192/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
193/// result[255:192].
194/// \returns A 256-bit vector of [16 x i16] containing the result.
195static __inline__ __m256i __DEFAULT_FN_ATTRS256
196_mm256_packs_epi32(__m256i __a, __m256i __b)
197{
198 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
199}
200
201/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
202/// using unsigned saturation, and returns the 256-bit result.
203///
204/// \code{.operation}
205/// FOR i := 0 TO 7
206/// j := i*16
207/// k := i*8
208/// result[7+k:k] := SATURATE8U(__a[15+j:j])
209/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
210/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
211/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
212/// ENDFOR
213/// \endcode
214///
215/// \headerfile <immintrin.h>
216///
217/// This intrinsic corresponds to the \c VPACKUSWB instruction.
218///
219/// \param __a
220/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
221/// result[191:128].
222/// \param __b
223/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
224/// result[255:192].
225/// \returns A 256-bit integer vector containing the result.
226static __inline__ __m256i __DEFAULT_FN_ATTRS256
227_mm256_packus_epi16(__m256i __a, __m256i __b)
228{
229 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
230}
231
232/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
233/// using unsigned saturation, and returns the resulting 256-bit vector of
234/// [16 x i16].
235///
236/// \code{.operation}
237/// FOR i := 0 TO 3
238/// j := i*32
239/// k := i*16
240/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
241/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
242/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
243/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
244/// ENDFOR
245/// \endcode
246///
247/// \headerfile <immintrin.h>
248///
249/// This intrinsic corresponds to the \c VPACKUSDW instruction.
250///
251/// \param __V1
252/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
253/// result[191:128].
254/// \param __V2
255/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
256/// result[255:192].
257/// \returns A 256-bit vector of [16 x i16] containing the result.
258static __inline__ __m256i __DEFAULT_FN_ATTRS256
259_mm256_packus_epi32(__m256i __V1, __m256i __V2)
260{
261 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
262}
263
264/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
265/// vectors and returns the lower 8 bits of each sum in the corresponding
266/// byte of the 256-bit integer vector result (overflow is ignored).
267///
268/// \headerfile <immintrin.h>
269///
270/// This intrinsic corresponds to the \c VPADDB instruction.
271///
272/// \param __a
273/// A 256-bit integer vector containing one of the source operands.
274/// \param __b
275/// A 256-bit integer vector containing one of the source operands.
276/// \returns A 256-bit integer vector containing the sums.
277static __inline__ __m256i __DEFAULT_FN_ATTRS256
278_mm256_add_epi8(__m256i __a, __m256i __b)
279{
280 return (__m256i)((__v32qu)__a + (__v32qu)__b);
281}
282
283/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284/// [16 x i16] and returns the lower 16 bits of each sum in the
285/// corresponding element of the [16 x i16] result (overflow is ignored).
286///
287/// \headerfile <immintrin.h>
288///
289/// This intrinsic corresponds to the \c VPADDW instruction.
290///
291/// \param __a
292/// A 256-bit vector of [16 x i16] containing one of the source operands.
293/// \param __b
294/// A 256-bit vector of [16 x i16] containing one of the source operands.
295/// \returns A 256-bit vector of [16 x i16] containing the sums.
296static __inline__ __m256i __DEFAULT_FN_ATTRS256
297_mm256_add_epi16(__m256i __a, __m256i __b)
298{
299 return (__m256i)((__v16hu)__a + (__v16hu)__b);
300}
301
302/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
303/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
304/// element of the [8 x i32] result (overflow is ignored).
305///
306/// \headerfile <immintrin.h>
307///
308/// This intrinsic corresponds to the \c VPADDD instruction.
309///
310/// \param __a
311/// A 256-bit vector of [8 x i32] containing one of the source operands.
312/// \param __b
313/// A 256-bit vector of [8 x i32] containing one of the source operands.
314/// \returns A 256-bit vector of [8 x i32] containing the sums.
315static __inline__ __m256i __DEFAULT_FN_ATTRS256
316_mm256_add_epi32(__m256i __a, __m256i __b)
317{
318 return (__m256i)((__v8su)__a + (__v8su)__b);
319}
320
321/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
322/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
323/// element of the [4 x i64] result (overflow is ignored).
324///
325/// \headerfile <immintrin.h>
326///
327/// This intrinsic corresponds to the \c VPADDQ instruction.
328///
329/// \param __a
330/// A 256-bit vector of [4 x i64] containing one of the source operands.
331/// \param __b
332/// A 256-bit vector of [4 x i64] containing one of the source operands.
333/// \returns A 256-bit vector of [4 x i64] containing the sums.
334static __inline__ __m256i __DEFAULT_FN_ATTRS256
335_mm256_add_epi64(__m256i __a, __m256i __b)
336{
337 return (__m256i)((__v4du)__a + (__v4du)__b);
338}
339
340/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
341/// vectors using signed saturation, and returns each sum in the
342/// corresponding byte of the 256-bit integer vector result.
343///
344/// \headerfile <immintrin.h>
345///
346/// This intrinsic corresponds to the \c VPADDSB instruction.
347///
348/// \param __a
349/// A 256-bit integer vector containing one of the source operands.
350/// \param __b
351/// A 256-bit integer vector containing one of the source operands.
352/// \returns A 256-bit integer vector containing the sums.
353static __inline__ __m256i __DEFAULT_FN_ATTRS256
354_mm256_adds_epi8(__m256i __a, __m256i __b)
355{
356 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
357}
358
359/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
360/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
361///
362/// \headerfile <immintrin.h>
363///
364/// This intrinsic corresponds to the \c VPADDSW instruction.
365///
366/// \param __a
367/// A 256-bit vector of [16 x i16] containing one of the source operands.
368/// \param __b
369/// A 256-bit vector of [16 x i16] containing one of the source operands.
370/// \returns A 256-bit vector of [16 x i16] containing the sums.
371static __inline__ __m256i __DEFAULT_FN_ATTRS256
372_mm256_adds_epi16(__m256i __a, __m256i __b)
373{
374 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
375}
376
377/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
378/// vectors using unsigned saturation, and returns each sum in the
379/// corresponding byte of the 256-bit integer vector result.
380///
381/// \headerfile <immintrin.h>
382///
383/// This intrinsic corresponds to the \c VPADDUSB instruction.
384///
385/// \param __a
386/// A 256-bit integer vector containing one of the source operands.
387/// \param __b
388/// A 256-bit integer vector containing one of the source operands.
389/// \returns A 256-bit integer vector containing the sums.
390static __inline__ __m256i __DEFAULT_FN_ATTRS256
391_mm256_adds_epu8(__m256i __a, __m256i __b)
392{
393 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
394}
395
396/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
397/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
398///
399/// \headerfile <immintrin.h>
400///
401/// This intrinsic corresponds to the \c VPADDUSW instruction.
402///
403/// \param __a
404/// A 256-bit vector of [16 x i16] containing one of the source operands.
405/// \param __b
406/// A 256-bit vector of [16 x i16] containing one of the source operands.
407/// \returns A 256-bit vector of [16 x i16] containing the sums.
408static __inline__ __m256i __DEFAULT_FN_ATTRS256
409_mm256_adds_epu16(__m256i __a, __m256i __b)
410{
411 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
412}
413
414/// Uses the lower half of the 256-bit vector \a a as the upper half of a
415/// temporary 256-bit value, and the lower half of the 256-bit vector \a b
416/// as the lower half of the temporary value. Right-shifts the temporary
417/// value by \a n bytes, and uses the lower 16 bytes of the shifted value
418/// as the lower 16 bytes of the result. Uses the upper halves of \a a and
419/// \a b to make another temporary value, right shifts by \a n, and uses
420/// the lower 16 bytes of the shifted value as the upper 16 bytes of the
421/// result.
422///
423/// \headerfile <immintrin.h>
424///
425/// \code
426/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
427/// \endcode
428///
429/// This intrinsic corresponds to the \c VPALIGNR instruction.
430///
431/// \param a
432/// A 256-bit integer vector containing source values.
433/// \param b
434/// A 256-bit integer vector containing source values.
435/// \param n
436/// An immediate value specifying the number of bytes to shift.
437/// \returns A 256-bit integer vector containing the result.
/* Casts a and b through __m256i to __v32qi, passes n through as written to
 * __builtin_ia32_palignr256, and casts the builtin's value to __m256i. */
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256( \
      (__v32qi)(__m256i)(a), (__v32qi)(__m256i)(b), (n)))
441
442/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
443/// \a __b.
444///
445/// \headerfile <immintrin.h>
446///
447/// This intrinsic corresponds to the \c VPAND instruction.
448///
449/// \param __a
450/// A 256-bit integer vector.
451/// \param __b
452/// A 256-bit integer vector.
453/// \returns A 256-bit integer vector containing the result.
454static __inline__ __m256i __DEFAULT_FN_ATTRS256
455_mm256_and_si256(__m256i __a, __m256i __b)
456{
457 return (__m256i)((__v4du)__a & (__v4du)__b);
458}
459
460/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
461/// the bitwise NOT of the 256-bit integer vector in \a __a.
462///
463/// \headerfile <immintrin.h>
464///
465/// This intrinsic corresponds to the \c VPANDN instruction.
466///
467/// \param __a
468/// A 256-bit integer vector.
469/// \param __b
470/// A 256-bit integer vector.
471/// \returns A 256-bit integer vector containing the result.
472static __inline__ __m256i __DEFAULT_FN_ATTRS256
473_mm256_andnot_si256(__m256i __a, __m256i __b)
474{
475 return (__m256i)(~(__v4du)__a & (__v4du)__b);
476}
477
478/// Computes the averages of the corresponding unsigned bytes in the two
479/// 256-bit integer vectors in \a __a and \a __b and returns each
480/// average in the corresponding byte of the 256-bit result.
481///
482/// \code{.operation}
483/// FOR i := 0 TO 31
484/// j := i*8
485/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
486/// ENDFOR
487/// \endcode
488///
489/// \headerfile <immintrin.h>
490///
491/// This intrinsic corresponds to the \c VPAVGB instruction.
492///
493/// \param __a
494/// A 256-bit integer vector.
495/// \param __b
496/// A 256-bit integer vector.
497/// \returns A 256-bit integer vector containing the result.
498static __inline__ __m256i __DEFAULT_FN_ATTRS256
499_mm256_avg_epu8(__m256i __a, __m256i __b)
500{
501 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
502}
503
504/// Computes the averages of the corresponding unsigned 16-bit integers in
505/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
506/// each average in the corresponding element of the 256-bit result.
507///
508/// \code{.operation}
509/// FOR i := 0 TO 15
510/// j := i*16
511/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
512/// ENDFOR
513/// \endcode
514///
515/// \headerfile <immintrin.h>
516///
517/// This intrinsic corresponds to the \c VPAVGW instruction.
518///
519/// \param __a
520/// A 256-bit vector of [16 x i16].
521/// \param __b
522/// A 256-bit vector of [16 x i16].
523/// \returns A 256-bit vector of [16 x i16] containing the result.
524static __inline__ __m256i __DEFAULT_FN_ATTRS256
525_mm256_avg_epu16(__m256i __a, __m256i __b)
526{
527 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
528}
529
530/// Merges 8-bit integer values from either of the two 256-bit vectors
531/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
532/// the resulting 256-bit integer vector.
533///
534/// \code{.operation}
535/// FOR i := 0 TO 31
536/// j := i*8
534/// IF __M[7+j] == 0
538/// result[7+j:j] := __V1[7+j:j]
539/// ELSE
540/// result[7+j:j] := __V2[7+j:j]
541/// FI
542/// ENDFOR
543/// \endcode
544///
545/// \headerfile <immintrin.h>
546///
547/// This intrinsic corresponds to the \c VPBLENDVB instruction.
548///
549/// \param __V1
550/// A 256-bit integer vector containing source values.
551/// \param __V2
552/// A 256-bit integer vector containing source values.
553/// \param __M
554/// A 256-bit integer vector, with bit [7] of each byte specifying the
555/// source for each corresponding byte of the result. When the mask bit
556/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
557/// \a __V2.
558/// \returns A 256-bit integer vector containing the result.
559static __inline__ __m256i __DEFAULT_FN_ATTRS256
560_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
561{
562 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
563 (__v32qi)__M);
564}
565
566/// Merges 16-bit integer values from either of the two 256-bit vectors
567/// \a V1 or \a V2, as specified by the immediate integer operand \a M,
568/// and returns the resulting 256-bit vector of [16 x i16].
569///
570/// \code{.operation}
571/// FOR i := 0 TO 7
572/// j := i*16
573/// IF M[i] == 0
571/// result[15+j:j] := V1[15+j:j]
572/// result[143+j:128+j] := V1[143+j:128+j]
573/// ELSE
574/// result[15+j:j] := V2[15+j:j]
575/// result[143+j:128+j] := V2[143+j:128+j]
579/// FI
580/// ENDFOR
581/// \endcode
582///
583/// \headerfile <immintrin.h>
584///
585/// \code
586/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
587/// \endcode
588///
589/// This intrinsic corresponds to the \c VPBLENDW instruction.
590///
591/// \param V1
592/// A 256-bit vector of [16 x i16] containing source values.
593/// \param V2
594/// A 256-bit vector of [16 x i16] containing source values.
595/// \param M
596/// An immediate 8-bit integer operand, with bits [7:0] specifying the
597/// source for each element of the result. The position of the mask bit
598/// corresponds to the index of a copied value. When a mask bit is 0, the
599/// element is copied from \a V1; otherwise, it is copied from \a V2.
600/// \a M[0] determines the source for elements 0 and 8, \a M[1] for
601/// elements 1 and 9, and so forth.
602/// \returns A 256-bit vector of [16 x i16] containing the result.
/* Casts V1 and V2 through __m256i to __v16hi and M to int, forwards all three
 * to __builtin_ia32_pblendw256, and casts the builtin's value to __m256i. */
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256( \
      (__v16hi)(__m256i)(V1), (__v16hi)(__m256i)(V2), (int)(M)))
606
607/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
608/// \a __b for equality and returns the outcomes in the corresponding
609/// bytes of the 256-bit result.
610///
611/// \code{.operation}
612/// FOR i := 0 TO 31
613/// j := i*8
614/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
615/// ENDFOR
616/// \endcode
617///
618/// \headerfile <immintrin.h>
619///
620/// This intrinsic corresponds to the \c VPCMPEQB instruction.
621///
622/// \param __a
623/// A 256-bit integer vector containing one of the inputs.
624/// \param __b
625/// A 256-bit integer vector containing one of the inputs.
626/// \returns A 256-bit integer vector containing the result.
627static __inline__ __m256i __DEFAULT_FN_ATTRS256
628_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
629{
630 return (__m256i)((__v32qi)__a == (__v32qi)__b);
631}
632
633/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
634/// \a __a and \a __b for equality and returns the outcomes in the
635/// corresponding elements of the 256-bit result.
636///
637/// \code{.operation}
638/// FOR i := 0 TO 15
639/// j := i*16
640/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
641/// ENDFOR
642/// \endcode
643///
644/// \headerfile <immintrin.h>
645///
646/// This intrinsic corresponds to the \c VPCMPEQW instruction.
647///
648/// \param __a
649/// A 256-bit vector of [16 x i16] containing one of the inputs.
650/// \param __b
651/// A 256-bit vector of [16 x i16] containing one of the inputs.
652/// \returns A 256-bit vector of [16 x i16] containing the result.
653static __inline__ __m256i __DEFAULT_FN_ATTRS256
654_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
655{
656 return (__m256i)((__v16hi)__a == (__v16hi)__b);
657}
658
659/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
660/// \a __a and \a __b for equality and returns the outcomes in the
661/// corresponding elements of the 256-bit result.
662///
663/// \code{.operation}
664/// FOR i := 0 TO 7
665/// j := i*32
666/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
667/// ENDFOR
668/// \endcode
669///
670/// \headerfile <immintrin.h>
671///
672/// This intrinsic corresponds to the \c VPCMPEQD instruction.
673///
674/// \param __a
675/// A 256-bit vector of [8 x i32] containing one of the inputs.
676/// \param __b
677/// A 256-bit vector of [8 x i32] containing one of the inputs.
678/// \returns A 256-bit vector of [8 x i32] containing the result.
679static __inline__ __m256i __DEFAULT_FN_ATTRS256
680_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
681{
682 return (__m256i)((__v8si)__a == (__v8si)__b);
683}
684
685/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
686/// \a __a and \a __b for equality and returns the outcomes in the
687/// corresponding elements of the 256-bit result.
688///
689/// \code{.operation}
690/// FOR i := 0 TO 3
691/// j := i*64
692/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
693/// ENDFOR
694/// \endcode
695///
696/// \headerfile <immintrin.h>
697///
698/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
699///
700/// \param __a
701/// A 256-bit vector of [4 x i64] containing one of the inputs.
702/// \param __b
703/// A 256-bit vector of [4 x i64] containing one of the inputs.
704/// \returns A 256-bit vector of [4 x i64] containing the result.
705static __inline__ __m256i __DEFAULT_FN_ATTRS256
706_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
707{
708 return (__m256i)((__v4di)__a == (__v4di)__b);
709}
710
711/// Compares corresponding signed bytes in the 256-bit integer vectors in
712/// \a __a and \a __b for greater-than and returns the outcomes in the
713/// corresponding bytes of the 256-bit result.
714///
715/// \code{.operation}
716/// FOR i := 0 TO 31
717/// j := i*8
718/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
719/// ENDFOR
720/// \endcode
721///
722/// \headerfile <immintrin.h>
723///
724/// This intrinsic corresponds to the \c VPCMPGTB instruction.
725///
726/// \param __a
727/// A 256-bit integer vector containing one of the inputs.
728/// \param __b
729/// A 256-bit integer vector containing one of the inputs.
730/// \returns A 256-bit integer vector containing the result.
731static __inline__ __m256i __DEFAULT_FN_ATTRS256
732_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
733{
734 /* This function always performs a signed comparison, but __v32qi is a char
735 which may be signed or unsigned, so use __v32qs. */
736 return (__m256i)((__v32qs)__a > (__v32qs)__b);
737}
738
739/// Compares corresponding signed elements in the 256-bit vectors of
740/// [16 x i16] in \a __a and \a __b for greater-than and returns the
741/// outcomes in the corresponding elements of the 256-bit result.
742///
743/// \code{.operation}
744/// FOR i := 0 TO 15
745/// j := i*16
746/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
747/// ENDFOR
748/// \endcode
749///
750/// \headerfile <immintrin.h>
751///
752/// This intrinsic corresponds to the \c VPCMPGTW instruction.
753///
754/// \param __a
755/// A 256-bit vector of [16 x i16] containing one of the inputs.
756/// \param __b
757/// A 256-bit vector of [16 x i16] containing one of the inputs.
758/// \returns A 256-bit vector of [16 x i16] containing the result.
759static __inline__ __m256i __DEFAULT_FN_ATTRS256
760_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
761{
762 return (__m256i)((__v16hi)__a > (__v16hi)__b);
763}
764
765/// Compares corresponding signed elements in the 256-bit vectors of
766/// [8 x i32] in \a __a and \a __b for greater-than and returns the
767/// outcomes in the corresponding elements of the 256-bit result.
768///
769/// \code{.operation}
770/// FOR i := 0 TO 7
771/// j := i*32
772/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
773/// ENDFOR
774/// \endcode
775///
776/// \headerfile <immintrin.h>
777///
778/// This intrinsic corresponds to the \c VPCMPGTD instruction.
779///
780/// \param __a
781/// A 256-bit vector of [8 x i32] containing one of the inputs.
782/// \param __b
783/// A 256-bit vector of [8 x i32] containing one of the inputs.
784/// \returns A 256-bit vector of [8 x i32] containing the result.
785static __inline__ __m256i __DEFAULT_FN_ATTRS256
786_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
787{
788 return (__m256i)((__v8si)__a > (__v8si)__b);
789}
790
791/// Compares corresponding signed elements in the 256-bit vectors of
792/// [4 x i64] in \a __a and \a __b for greater-than and returns the
793/// outcomes in the corresponding elements of the 256-bit result.
794///
795/// \code{.operation}
796/// FOR i := 0 TO 3
797/// j := i*64
798/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
799/// ENDFOR
800/// \endcode
801///
802/// \headerfile <immintrin.h>
803///
804/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
805///
806/// \param __a
807/// A 256-bit vector of [4 x i64] containing one of the inputs.
808/// \param __b
809/// A 256-bit vector of [4 x i64] containing one of the inputs.
810/// \returns A 256-bit vector of [4 x i64] containing the result.
811static __inline__ __m256i __DEFAULT_FN_ATTRS256
812_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
813{
814 return (__m256i)((__v4di)__a > (__v4di)__b);
815}
816
817/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
818/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
819/// element of the [16 x i16] result (overflow is ignored). Sums from
820/// \a __a are returned in the lower 64 bits of each 128-bit half of the
821/// result; sums from \a __b are returned in the upper 64 bits of each
822/// 128-bit half of the result.
823///
824/// \code{.operation}
825/// FOR i := 0 TO 1
826/// j := i*128
827/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
828/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
829/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
830/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
831/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
832/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
833/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
834/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
835/// ENDFOR
836/// \endcode
837///
838/// \headerfile <immintrin.h>
839///
840/// This intrinsic corresponds to the \c VPHADDW instruction.
841///
842/// \param __a
843/// A 256-bit vector of [16 x i16] containing one of the source operands.
844/// \param __b
845/// A 256-bit vector of [16 x i16] containing one of the source operands.
846/// \returns A 256-bit vector of [16 x i16] containing the sums.
847static __inline__ __m256i __DEFAULT_FN_ATTRS256
848_mm256_hadd_epi16(__m256i __a, __m256i __b)
849{
850 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
851}
852
853/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
854/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
855/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
856/// are returned in the lower 64 bits of each 128-bit half of the result;
857/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
858/// of the result.
859///
860/// \code{.operation}
861/// FOR i := 0 TO 1
862/// j := i*128
863/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
864/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
865/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
866/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
867/// ENDFOR
868/// \endcode
869///
870/// \headerfile <immintrin.h>
871///
872/// This intrinsic corresponds to the \c VPHADDD instruction.
873///
874/// \param __a
875/// A 256-bit vector of [8 x i32] containing one of the source operands.
876/// \param __b
877/// A 256-bit vector of [8 x i32] containing one of the source operands.
878/// \returns A 256-bit vector of [8 x i32] containing the sums.
879static __inline__ __m256i __DEFAULT_FN_ATTRS256
880_mm256_hadd_epi32(__m256i __a, __m256i __b)
881{
882 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
883}
884
885/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
886/// vectors of [16 x i16] using signed saturation and returns each sum in
887/// an element of the [16 x i16] result. Sums from \a __a are returned in
888/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
889/// are returned in the upper 64 bits of each 128-bit half of the result.
890///
891/// \code{.operation}
892/// FOR i := 0 TO 1
893/// j := i*128
894/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
895/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
896/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
897/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
898/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
899/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
900/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
901/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
902/// ENDFOR
903/// \endcode
904///
905/// \headerfile <immintrin.h>
906///
907/// This intrinsic corresponds to the \c VPHADDSW instruction.
908///
909/// \param __a
910/// A 256-bit vector of [16 x i16] containing one of the source operands.
911/// \param __b
912/// A 256-bit vector of [16 x i16] containing one of the source operands.
913/// \returns A 256-bit vector of [16 x i16] containing the sums.
914static __inline__ __m256i __DEFAULT_FN_ATTRS256
915_mm256_hadds_epi16(__m256i __a, __m256i __b)
916{
917 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
918}
919
920/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
921/// vectors of [16 x i16] and returns the lower 16 bits of each difference
922/// in an element of the [16 x i16] result (overflow is ignored).
923/// Differences from \a __a are returned in the lower 64 bits of each
924/// 128-bit half of the result; differences from \a __b are returned in the
925/// upper 64 bits of each 128-bit half of the result.
926///
927/// \code{.operation}
928/// FOR i := 0 TO 1
929/// j := i*128
930/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
931/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
932/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
933/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
934/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
935/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
936/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
937/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
938/// ENDFOR
939/// \endcode
940///
941/// \headerfile <immintrin.h>
942///
943/// This intrinsic corresponds to the \c VPHSUBW instruction.
944///
945/// \param __a
946/// A 256-bit vector of [16 x i16] containing one of the source operands.
947/// \param __b
948/// A 256-bit vector of [16 x i16] containing one of the source operands.
949/// \returns A 256-bit vector of [16 x i16] containing the differences.
950static __inline__ __m256i __DEFAULT_FN_ATTRS256
951_mm256_hsub_epi16(__m256i __a, __m256i __b)
952{
953 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
954}
955
956/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
957/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
958/// an element of the [8 x i32] result (overflow is ignored). Differences
959/// from \a __a are returned in the lower 64 bits of each 128-bit half of
960/// the result; differences from \a __b are returned in the upper 64 bits
961/// of each 128-bit half of the result.
962///
963/// \code{.operation}
964/// FOR i := 0 TO 1
965/// j := i*128
966/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
967/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
968/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
969/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
970/// ENDFOR
971/// \endcode
972///
973/// \headerfile <immintrin.h>
974///
975/// This intrinsic corresponds to the \c VPHSUBD instruction.
976///
977/// \param __a
978/// A 256-bit vector of [8 x i32] containing one of the source operands.
979/// \param __b
980/// A 256-bit vector of [8 x i32] containing one of the source operands.
981/// \returns A 256-bit vector of [8 x i32] containing the differences.
982static __inline__ __m256i __DEFAULT_FN_ATTRS256
983_mm256_hsub_epi32(__m256i __a, __m256i __b)
984{
985 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
986}
987
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
/// vectors of [16 x i16] using signed saturation and returns each
/// difference in an element of the [16 x i16] result. Differences from
/// \a __a are returned in the lower 64 bits of each 128-bit half of the
/// result; differences from \a __b are returned in the upper 64 bits of
/// each 128-bit half of the result.
994///
995/// \code{.operation}
996/// FOR i := 0 TO 1
997/// j := i*128
998/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
999/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1000/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1001/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1002/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1003/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1004/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1005/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1006/// ENDFOR
1007/// \endcode
1008///
1009/// \headerfile <immintrin.h>
1010///
1011/// This intrinsic corresponds to the \c VPHSUBSW instruction.
1012///
1013/// \param __a
1014/// A 256-bit vector of [16 x i16] containing one of the source operands.
1015/// \param __b
1016/// A 256-bit vector of [16 x i16] containing one of the source operands.
1017/// \returns A 256-bit vector of [16 x i16] containing the differences.
1018static __inline__ __m256i __DEFAULT_FN_ATTRS256
1019_mm256_hsubs_epi16(__m256i __a, __m256i __b)
1020{
1021 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1022}
1023
1024/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1025/// with the corresponding signed byte from the 256-bit integer vector in
1026/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1027/// pairs of those products using signed saturation to form 16-bit sums
1028/// returned as elements of the [16 x i16] result.
1029///
1030/// \code{.operation}
1031/// FOR i := 0 TO 15
1032/// j := i*16
1033/// temp1 := __a[j+7:j] * __b[j+7:j]
1034/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1035/// result[j+15:j] := SATURATE16(temp1 + temp2)
1036/// ENDFOR
1037/// \endcode
1038///
1039/// \headerfile <immintrin.h>
1040///
1041/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1042///
1043/// \param __a
1044/// A 256-bit vector containing one of the source operands.
1045/// \param __b
1046/// A 256-bit vector containing one of the source operands.
1047/// \returns A 256-bit vector of [16 x i16] containing the result.
1048static __inline__ __m256i __DEFAULT_FN_ATTRS256
1050{
1051 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1052}
1053
1054/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1055/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1056/// those products to form 32-bit sums returned as elements of the
1057/// [8 x i32] result.
1058///
1059/// There is only one wraparound case: when all four of the 16-bit sources
1060/// are \c 0x8000, the result will be \c 0x80000000.
1061///
1062/// \code{.operation}
1063/// FOR i := 0 TO 7
1064/// j := i*32
1065/// temp1 := __a[j+15:j] * __b[j+15:j]
1066/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1067/// result[j+31:j] := temp1 + temp2
1068/// ENDFOR
1069/// \endcode
1070///
1071/// \headerfile <immintrin.h>
1072///
1073/// This intrinsic corresponds to the \c VPMADDWD instruction.
1074///
1075/// \param __a
1076/// A 256-bit vector of [16 x i16] containing one of the source operands.
1077/// \param __b
1078/// A 256-bit vector of [16 x i16] containing one of the source operands.
1079/// \returns A 256-bit vector of [8 x i32] containing the result.
1080static __inline__ __m256i __DEFAULT_FN_ATTRS256
1081_mm256_madd_epi16(__m256i __a, __m256i __b)
1082{
1083 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1084}
1085
1086/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1087/// in \a __a and \a __b and returns the larger of each pair in the
1088/// corresponding byte of the 256-bit result.
1089///
1090/// \headerfile <immintrin.h>
1091///
1092/// This intrinsic corresponds to the \c VPMAXSB instruction.
1093///
1094/// \param __a
1095/// A 256-bit integer vector.
1096/// \param __b
1097/// A 256-bit integer vector.
1098/// \returns A 256-bit integer vector containing the result.
1099static __inline__ __m256i __DEFAULT_FN_ATTRS256
1100_mm256_max_epi8(__m256i __a, __m256i __b)
1101{
1102 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1103}
1104
1105/// Compares the corresponding signed 16-bit integers in the two 256-bit
1106/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1107/// each pair in the corresponding element of the 256-bit result.
1108///
1109/// \headerfile <immintrin.h>
1110///
1111/// This intrinsic corresponds to the \c VPMAXSW instruction.
1112///
1113/// \param __a
1114/// A 256-bit vector of [16 x i16].
1115/// \param __b
1116/// A 256-bit vector of [16 x i16].
1117/// \returns A 256-bit vector of [16 x i16] containing the result.
1118static __inline__ __m256i __DEFAULT_FN_ATTRS256
1119_mm256_max_epi16(__m256i __a, __m256i __b)
1120{
1121 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1122}
1123
1124/// Compares the corresponding signed 32-bit integers in the two 256-bit
1125/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1126/// each pair in the corresponding element of the 256-bit result.
1127///
1128/// \headerfile <immintrin.h>
1129///
1130/// This intrinsic corresponds to the \c VPMAXSD instruction.
1131///
1132/// \param __a
1133/// A 256-bit vector of [8 x i32].
1134/// \param __b
1135/// A 256-bit vector of [8 x i32].
1136/// \returns A 256-bit vector of [8 x i32] containing the result.
1137static __inline__ __m256i __DEFAULT_FN_ATTRS256
1138_mm256_max_epi32(__m256i __a, __m256i __b)
1139{
1140 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1141}
1142
1143/// Compares the corresponding unsigned bytes in the two 256-bit integer
1144/// vectors in \a __a and \a __b and returns the larger of each pair in
1145/// the corresponding byte of the 256-bit result.
1146///
1147/// \headerfile <immintrin.h>
1148///
1149/// This intrinsic corresponds to the \c VPMAXUB instruction.
1150///
1151/// \param __a
1152/// A 256-bit integer vector.
1153/// \param __b
1154/// A 256-bit integer vector.
1155/// \returns A 256-bit integer vector containing the result.
1156static __inline__ __m256i __DEFAULT_FN_ATTRS256
1157_mm256_max_epu8(__m256i __a, __m256i __b)
1158{
1159 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1160}
1161
1162/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1163/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1164/// each pair in the corresponding element of the 256-bit result.
1165///
1166/// \headerfile <immintrin.h>
1167///
1168/// This intrinsic corresponds to the \c VPMAXUW instruction.
1169///
1170/// \param __a
1171/// A 256-bit vector of [16 x i16].
1172/// \param __b
1173/// A 256-bit vector of [16 x i16].
1174/// \returns A 256-bit vector of [16 x i16] containing the result.
1175static __inline__ __m256i __DEFAULT_FN_ATTRS256
1176_mm256_max_epu16(__m256i __a, __m256i __b)
1177{
1178 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1179}
1180
1181/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1182/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1183/// each pair in the corresponding element of the 256-bit result.
1184///
1185/// \headerfile <immintrin.h>
1186///
1187/// This intrinsic corresponds to the \c VPMAXUD instruction.
1188///
1189/// \param __a
1190/// A 256-bit vector of [8 x i32].
1191/// \param __b
1192/// A 256-bit vector of [8 x i32].
1193/// \returns A 256-bit vector of [8 x i32] containing the result.
1194static __inline__ __m256i __DEFAULT_FN_ATTRS256
1195_mm256_max_epu32(__m256i __a, __m256i __b)
1196{
1197 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1198}
1199
1200/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1201/// in \a __a and \a __b and returns the smaller of each pair in the
1202/// corresponding byte of the 256-bit result.
1203///
1204/// \headerfile <immintrin.h>
1205///
1206/// This intrinsic corresponds to the \c VPMINSB instruction.
1207///
1208/// \param __a
1209/// A 256-bit integer vector.
1210/// \param __b
1211/// A 256-bit integer vector.
1212/// \returns A 256-bit integer vector containing the result.
1213static __inline__ __m256i __DEFAULT_FN_ATTRS256
1214_mm256_min_epi8(__m256i __a, __m256i __b)
1215{
1216 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1217}
1218
1219/// Compares the corresponding signed 16-bit integers in the two 256-bit
1220/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1221/// each pair in the corresponding element of the 256-bit result.
1222///
1223/// \headerfile <immintrin.h>
1224///
1225/// This intrinsic corresponds to the \c VPMINSW instruction.
1226///
1227/// \param __a
1228/// A 256-bit vector of [16 x i16].
1229/// \param __b
1230/// A 256-bit vector of [16 x i16].
1231/// \returns A 256-bit vector of [16 x i16] containing the result.
1232static __inline__ __m256i __DEFAULT_FN_ATTRS256
1233_mm256_min_epi16(__m256i __a, __m256i __b)
1234{
1235 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1236}
1237
1238/// Compares the corresponding signed 32-bit integers in the two 256-bit
1239/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1240/// each pair in the corresponding element of the 256-bit result.
1241///
1242/// \headerfile <immintrin.h>
1243///
1244/// This intrinsic corresponds to the \c VPMINSD instruction.
1245///
1246/// \param __a
1247/// A 256-bit vector of [8 x i32].
1248/// \param __b
1249/// A 256-bit vector of [8 x i32].
1250/// \returns A 256-bit vector of [8 x i32] containing the result.
1251static __inline__ __m256i __DEFAULT_FN_ATTRS256
1252_mm256_min_epi32(__m256i __a, __m256i __b)
1253{
1254 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1255}
1256
1257/// Compares the corresponding unsigned bytes in the two 256-bit integer
1258/// vectors in \a __a and \a __b and returns the smaller of each pair in
1259/// the corresponding byte of the 256-bit result.
1260///
1261/// \headerfile <immintrin.h>
1262///
1263/// This intrinsic corresponds to the \c VPMINUB instruction.
1264///
1265/// \param __a
1266/// A 256-bit integer vector.
1267/// \param __b
1268/// A 256-bit integer vector.
1269/// \returns A 256-bit integer vector containing the result.
1270static __inline__ __m256i __DEFAULT_FN_ATTRS256
1271_mm256_min_epu8(__m256i __a, __m256i __b)
1272{
1273 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1274}
1275
1276/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1277/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1278/// each pair in the corresponding element of the 256-bit result.
1279///
1280/// \headerfile <immintrin.h>
1281///
1282/// This intrinsic corresponds to the \c VPMINUW instruction.
1283///
1284/// \param __a
1285/// A 256-bit vector of [16 x i16].
1286/// \param __b
1287/// A 256-bit vector of [16 x i16].
1288/// \returns A 256-bit vector of [16 x i16] containing the result.
1289static __inline__ __m256i __DEFAULT_FN_ATTRS256
1290_mm256_min_epu16(__m256i __a, __m256i __b)
1291{
1292 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1293}
1294
1295/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1296/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1297/// each pair in the corresponding element of the 256-bit result.
1298///
1299/// \headerfile <immintrin.h>
1300///
1301/// This intrinsic corresponds to the \c VPMINUD instruction.
1302///
1303/// \param __a
1304/// A 256-bit vector of [8 x i32].
1305/// \param __b
1306/// A 256-bit vector of [8 x i32].
1307/// \returns A 256-bit vector of [8 x i32] containing the result.
1308static __inline__ __m256i __DEFAULT_FN_ATTRS256
1309_mm256_min_epu32(__m256i __a, __m256i __b)
1310{
1311 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1312}
1313
1314/// Creates a 32-bit integer mask from the most significant bit of each byte
1315/// in the 256-bit integer vector in \a __a and returns the result.
1316///
1317/// \code{.operation}
1318/// FOR i := 0 TO 31
1319/// j := i*8
1320/// result[i] := __a[j+7]
1321/// ENDFOR
1322/// \endcode
1323///
1324/// \headerfile <immintrin.h>
1325///
1326/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1327///
1328/// \param __a
1329/// A 256-bit integer vector containing the source bytes.
1330/// \returns The 32-bit integer mask.
1331static __inline__ int __DEFAULT_FN_ATTRS256
1333{
1334 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1335}
1336
1337/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1338/// the 16-bit values in the corresponding elements of a 256-bit vector
1339/// of [16 x i16].
1340///
1341/// \code{.operation}
1342/// FOR i := 0 TO 15
1343/// j := i*8
1344/// k := i*16
1345/// result[k+15:k] := SignExtend(__V[j+7:j])
1346/// ENDFOR
1347/// \endcode
1348///
1349/// \headerfile <immintrin.h>
1350///
1351/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1352///
1353/// \param __V
1354/// A 128-bit integer vector containing the source bytes.
1355/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1356/// values.
1357static __inline__ __m256i __DEFAULT_FN_ATTRS256
1359{
1360 /* This function always performs a signed extension, but __v16qi is a char
1361 which may be signed or unsigned, so use __v16qs. */
1362 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1363}
1364
1365/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1366/// \a __V and returns the 32-bit values in the corresponding elements of a
1367/// 256-bit vector of [8 x i32].
1368///
1369/// \code{.operation}
1370/// FOR i := 0 TO 7
1371/// j := i*8
1372/// k := i*32
1373/// result[k+31:k] := SignExtend(__V[j+7:j])
1374/// ENDFOR
1375/// \endcode
1376///
1377/// \headerfile <immintrin.h>
1378///
1379/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1380///
1381/// \param __V
1382/// A 128-bit integer vector containing the source bytes.
1383/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1384/// values.
1385static __inline__ __m256i __DEFAULT_FN_ATTRS256
1387{
1388 /* This function always performs a signed extension, but __v16qi is a char
1389 which may be signed or unsigned, so use __v16qs. */
1390 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1391}
1392
1393/// Sign-extends the first four bytes from the 128-bit integer vector in
1394/// \a __V and returns the 64-bit values in the corresponding elements of a
1395/// 256-bit vector of [4 x i64].
1396///
1397/// \code{.operation}
1398/// result[63:0] := SignExtend(__V[7:0])
1399/// result[127:64] := SignExtend(__V[15:8])
1400/// result[191:128] := SignExtend(__V[23:16])
1401/// result[255:192] := SignExtend(__V[31:24])
1402/// \endcode
1403///
1404/// \headerfile <immintrin.h>
1405///
1406/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1407///
1408/// \param __V
1409/// A 128-bit integer vector containing the source bytes.
1410/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1411/// values.
1412static __inline__ __m256i __DEFAULT_FN_ATTRS256
1414{
1415 /* This function always performs a signed extension, but __v16qi is a char
1416 which may be signed or unsigned, so use __v16qs. */
1417 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1418}
1419
1420/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1421/// \a __V and returns the 32-bit values in the corresponding elements of a
1422/// 256-bit vector of [8 x i32].
1423///
1424/// \code{.operation}
1425/// FOR i := 0 TO 7
1426/// j := i*16
1427/// k := i*32
1428/// result[k+31:k] := SignExtend(__V[j+15:j])
1429/// ENDFOR
1430/// \endcode
1431///
1432/// \headerfile <immintrin.h>
1433///
1434/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1435///
1436/// \param __V
1437/// A 128-bit vector of [8 x i16] containing the source values.
1438/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1439/// values.
1440static __inline__ __m256i __DEFAULT_FN_ATTRS256
1442{
1443 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1444}
1445
1446/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1447/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1448/// elements of a 256-bit vector of [4 x i64].
1449///
1450/// \code{.operation}
1451/// result[63:0] := SignExtend(__V[15:0])
1452/// result[127:64] := SignExtend(__V[31:16])
1453/// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
1455/// \endcode
1456///
1457/// \headerfile <immintrin.h>
1458///
1459/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1460///
1461/// \param __V
1462/// A 128-bit vector of [8 x i16] containing the source values.
1463/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1464/// values.
1465static __inline__ __m256i __DEFAULT_FN_ATTRS256
1467{
1468 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1469}
1470
1471/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1472/// \a __V and returns the 64-bit values in the corresponding elements of a
1473/// 256-bit vector of [4 x i64].
1474///
1475/// \code{.operation}
1476/// result[63:0] := SignExtend(__V[31:0])
1477/// result[127:64] := SignExtend(__V[63:32])
1478/// result[191:128] := SignExtend(__V[95:64])
1479/// result[255:192] := SignExtend(__V[127:96])
1480/// \endcode
1481///
1482/// \headerfile <immintrin.h>
1483///
1484/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1485///
1486/// \param __V
1487/// A 128-bit vector of [4 x i32] containing the source values.
1488/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1489/// values.
1490static __inline__ __m256i __DEFAULT_FN_ATTRS256
1492{
1493 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1494}
1495
1496/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1497/// the 16-bit values in the corresponding elements of a 256-bit vector
1498/// of [16 x i16].
1499///
1500/// \code{.operation}
1501/// FOR i := 0 TO 15
1502/// j := i*8
1503/// k := i*16
1504/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1505/// ENDFOR
1506/// \endcode
1507///
1508/// \headerfile <immintrin.h>
1509///
1510/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1511///
1512/// \param __V
1513/// A 128-bit integer vector containing the source bytes.
1514/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1515/// values.
1516static __inline__ __m256i __DEFAULT_FN_ATTRS256
1518{
1519 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1520}
1521
1522/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1523/// \a __V and returns the 32-bit values in the corresponding elements of a
1524/// 256-bit vector of [8 x i32].
1525///
1526/// \code{.operation}
1527/// FOR i := 0 TO 7
1528/// j := i*8
1529/// k := i*32
1530/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1531/// ENDFOR
1532/// \endcode
1533///
1534/// \headerfile <immintrin.h>
1535///
1536/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1537///
1538/// \param __V
1539/// A 128-bit integer vector containing the source bytes.
1540/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1541/// values.
1542static __inline__ __m256i __DEFAULT_FN_ATTRS256
1544{
1545 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1546}
1547
1548/// Zero-extends the first four bytes from the 128-bit integer vector in
1549/// \a __V and returns the 64-bit values in the corresponding elements of a
1550/// 256-bit vector of [4 x i64].
1551///
1552/// \code{.operation}
1553/// result[63:0] := ZeroExtend(__V[7:0])
1554/// result[127:64] := ZeroExtend(__V[15:8])
1555/// result[191:128] := ZeroExtend(__V[23:16])
1556/// result[255:192] := ZeroExtend(__V[31:24])
1557/// \endcode
1558///
1559/// \headerfile <immintrin.h>
1560///
1561/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1562///
1563/// \param __V
1564/// A 128-bit integer vector containing the source bytes.
1565/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1566/// values.
1567static __inline__ __m256i __DEFAULT_FN_ATTRS256
1569{
1570 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1571}
1572
1573/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1574/// \a __V and returns the 32-bit values in the corresponding elements of a
1575/// 256-bit vector of [8 x i32].
1576///
1577/// \code{.operation}
1578/// FOR i := 0 TO 7
1579/// j := i*16
1580/// k := i*32
1581/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1582/// ENDFOR
1583/// \endcode
1584///
1585/// \headerfile <immintrin.h>
1586///
1587/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1588///
1589/// \param __V
1590/// A 128-bit vector of [8 x i16] containing the source values.
1591/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1592/// values.
1593static __inline__ __m256i __DEFAULT_FN_ATTRS256
1595{
1596 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1597}
1598
1599/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1600/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1601/// elements of a 256-bit vector of [4 x i64].
1602///
1603/// \code{.operation}
1604/// result[63:0] := ZeroExtend(__V[15:0])
1605/// result[127:64] := ZeroExtend(__V[31:16])
1606/// result[191:128] := ZeroExtend(__V[47:32])
/// result[255:192] := ZeroExtend(__V[63:48])
1608/// \endcode
1609///
1610/// \headerfile <immintrin.h>
1611///
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1613///
1614/// \param __V
1615/// A 128-bit vector of [8 x i16] containing the source values.
1616/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1617/// values.
1618static __inline__ __m256i __DEFAULT_FN_ATTRS256
1620{
1621 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1622}
1623
1624/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1625/// \a __V and returns the 64-bit values in the corresponding elements of a
1626/// 256-bit vector of [4 x i64].
1627///
1628/// \code{.operation}
1629/// result[63:0] := ZeroExtend(__V[31:0])
1630/// result[127:64] := ZeroExtend(__V[63:32])
1631/// result[191:128] := ZeroExtend(__V[95:64])
1632/// result[255:192] := ZeroExtend(__V[127:96])
1633/// \endcode
1634///
1635/// \headerfile <immintrin.h>
1636///
1637/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1638///
1639/// \param __V
1640/// A 128-bit vector of [4 x i32] containing the source values.
1641/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1642/// values.
1643static __inline__ __m256i __DEFAULT_FN_ATTRS256
1645{
1646 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1647}
1648
1649/// Multiplies signed 32-bit integers from even-numbered elements of two
1650/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1651/// [4 x i64] result.
1652///
1653/// \code{.operation}
1654/// result[63:0] := __a[31:0] * __b[31:0]
1655/// result[127:64] := __a[95:64] * __b[95:64]
1656/// result[191:128] := __a[159:128] * __b[159:128]
1657/// result[255:192] := __a[223:192] * __b[223:192]
1658/// \endcode
1659///
1660/// \headerfile <immintrin.h>
1661///
1662/// This intrinsic corresponds to the \c VPMULDQ instruction.
1663///
1664/// \param __a
1665/// A 256-bit vector of [8 x i32] containing one of the source operands.
1666/// \param __b
1667/// A 256-bit vector of [8 x i32] containing one of the source operands.
1668/// \returns A 256-bit vector of [4 x i64] containing the products.
1669static __inline__ __m256i __DEFAULT_FN_ATTRS256
1670_mm256_mul_epi32(__m256i __a, __m256i __b)
1671{
1672 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1673}
1674
1675/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676/// [16 x i16], truncates the 32-bit results to the most significant 18
1677/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1678/// product in the [16 x i16] result.
1679///
1680/// \code{.operation}
1681/// FOR i := 0 TO 15
1682/// j := i*16
1683/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
/// result[j+15:j] := temp[16:1]
/// ENDFOR
/// \endcode
1686///
1687/// \headerfile <immintrin.h>
1688///
1689/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1690///
1691/// \param __a
1692/// A 256-bit vector of [16 x i16] containing one of the source operands.
1693/// \param __b
1694/// A 256-bit vector of [16 x i16] containing one of the source operands.
1695/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1696static __inline__ __m256i __DEFAULT_FN_ATTRS256
1697_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1698{
1699 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1700}
1701
1702/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1703/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1704/// [16 x i16] result.
1705///
1706/// \headerfile <immintrin.h>
1707///
1708/// This intrinsic corresponds to the \c VPMULHUW instruction.
1709///
1710/// \param __a
1711/// A 256-bit vector of [16 x i16] containing one of the source operands.
1712/// \param __b
1713/// A 256-bit vector of [16 x i16] containing one of the source operands.
1714/// \returns A 256-bit vector of [16 x i16] containing the products.
1715static __inline__ __m256i __DEFAULT_FN_ATTRS256
1716_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1717{
1718 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1719}
1720
1721/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1722/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1723/// [16 x i16] result.
1724///
1725/// \headerfile <immintrin.h>
1726///
1727/// This intrinsic corresponds to the \c VPMULHW instruction.
1728///
1729/// \param __a
1730/// A 256-bit vector of [16 x i16] containing one of the source operands.
1731/// \param __b
1732/// A 256-bit vector of [16 x i16] containing one of the source operands.
1733/// \returns A 256-bit vector of [16 x i16] containing the products.
1734static __inline__ __m256i __DEFAULT_FN_ATTRS256
1735_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1736{
1737 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1738}
1739
1740/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1741/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1742/// [16 x i16] result.
1743///
1744/// \headerfile <immintrin.h>
1745///
1746/// This intrinsic corresponds to the \c VPMULLW instruction.
1747///
1748/// \param __a
1749/// A 256-bit vector of [16 x i16] containing one of the source operands.
1750/// \param __b
1751/// A 256-bit vector of [16 x i16] containing one of the source operands.
1752/// \returns A 256-bit vector of [16 x i16] containing the products.
1753static __inline__ __m256i __DEFAULT_FN_ATTRS256
1754_mm256_mullo_epi16(__m256i __a, __m256i __b)
1755{
1756 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1757}
1758
1759/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1760/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1761/// [8 x i32] result.
1762///
1763/// \headerfile <immintrin.h>
1764///
1765/// This intrinsic corresponds to the \c VPMULLD instruction.
1766///
1767/// \param __a
1768/// A 256-bit vector of [8 x i32] containing one of the source operands.
1769/// \param __b
1770/// A 256-bit vector of [8 x i32] containing one of the source operands.
1771/// \returns A 256-bit vector of [8 x i32] containing the products.
1772static __inline__ __m256i __DEFAULT_FN_ATTRS256
1773_mm256_mullo_epi32 (__m256i __a, __m256i __b)
1774{
1775 return (__m256i)((__v8su)__a * (__v8su)__b);
1776}
1777
1778/// Multiplies unsigned 32-bit integers from even-numered elements of two
1779/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1780/// [4 x i64] result.
1781///
1782/// \code{.operation}
1783/// result[63:0] := __a[31:0] * __b[31:0]
1784/// result[127:64] := __a[95:64] * __b[95:64]
1785/// result[191:128] := __a[159:128] * __b[159:128]
1786/// result[255:192] := __a[223:192] * __b[223:192]
1787/// \endcode
1788///
1789/// \headerfile <immintrin.h>
1790///
1791/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1792///
1793/// \param __a
1794/// A 256-bit vector of [8 x i32] containing one of the source operands.
1795/// \param __b
1796/// A 256-bit vector of [8 x i32] containing one of the source operands.
1797/// \returns A 256-bit vector of [4 x i64] containing the products.
1798static __inline__ __m256i __DEFAULT_FN_ATTRS256
1799_mm256_mul_epu32(__m256i __a, __m256i __b)
1800{
1801 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1802}
1803
1804/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1805/// \a __b.
1806///
1807/// \headerfile <immintrin.h>
1808///
1809/// This intrinsic corresponds to the \c VPOR instruction.
1810///
1811/// \param __a
1812/// A 256-bit integer vector.
1813/// \param __b
1814/// A 256-bit integer vector.
1815/// \returns A 256-bit integer vector containing the result.
1816static __inline__ __m256i __DEFAULT_FN_ATTRS256
1817_mm256_or_si256(__m256i __a, __m256i __b)
1818{
1819 return (__m256i)((__v4du)__a | (__v4du)__b);
1820}
1821
1822/// Computes four sum of absolute difference (SAD) operations on sets of eight
1823/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1824/// \a __b.
1825///
1826/// One SAD result is computed for each set of eight bytes from \a __a and
1827/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1828/// corresponding 64-bit element of the result.
1829///
1830/// A single SAD operation takes the differences between the corresponding
1831/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1832/// and sums these eight values to form one 16-bit result. This operation
1833/// is repeated four times with successive sets of eight bytes.
1834///
1835/// \code{.operation}
1836/// FOR i := 0 TO 3
1837/// j := i*64
1838/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1839/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1840/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1841/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1842/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1843/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1844/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1845/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1846/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1847/// temp4 + temp5 + temp6 + temp7
1848/// result[j+63:j+16] := 0
1849/// ENDFOR
1850/// \endcode
1851///
1852/// \headerfile <immintrin.h>
1853///
1854/// This intrinsic corresponds to the \c VPSADBW instruction.
1855///
1856/// \param __a
1857/// A 256-bit integer vector.
1858/// \param __b
1859/// A 256-bit integer vector.
1860/// \returns A 256-bit integer vector containing the result.
1861static __inline__ __m256i __DEFAULT_FN_ATTRS256
1862_mm256_sad_epu8(__m256i __a, __m256i __b)
1863{
1864 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1865}
1866
1867/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1868/// to control information in the 256-bit integer vector \a __b, and
1869/// returns the 256-bit result. In effect there are two separate 128-bit
1870/// shuffles in the lower and upper halves.
1871///
1872/// \code{.operation}
1873/// FOR i := 0 TO 31
1874/// j := i*8
1875/// IF __b[j+7] == 1
1876/// result[j+7:j] := 0
1877/// ELSE
1878/// k := __b[j+3:j] * 8
1879/// IF i > 15
1880/// k := k + 128
1881/// FI
1882/// result[j+7:j] := __a[k+7:k]
1883/// FI
1884/// ENDFOR
1885/// \endcode
1886///
1887/// \headerfile <immintrin.h>
1888///
1889/// This intrinsic corresponds to the \c VPSHUFB instruction.
1890///
1891/// \param __a
1892/// A 256-bit integer vector containing source values.
1893/// \param __b
1894/// A 256-bit integer vector containing control information to determine
1895/// what goes into the corresponding byte of the result. If bit 7 of the
1896/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1897/// control byte specify the index (within the same 128-bit half) of \a __a
1898/// to copy to the result byte.
1899/// \returns A 256-bit integer vector containing the result.
1900static __inline__ __m256i __DEFAULT_FN_ATTRS256
1901_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1902{
1903 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1904}
1905
/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. In effect there are two parallel 128-bit
///    shuffles in the lower and upper halves.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1938
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth)
///    within each 128-bit half.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1974
/// Rearranges the low four 16-bit integers of each 128-bit half of the
///    256-bit vector of [16 x i16] \a a, as selected by the integer literal
///    \a imm. The upper 64 bits of each 128-bit half pass through from \a a
///    unmodified.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] supplying the source elements.
/// \param imm
///    An immediate 8-bit selector. \a imm[1:0] gives the index in \a a of
///    the value placed in elements 0 and 8 of the result, \a imm[3:2] gives
///    the index for elements 1 and 9, and so on.
/// \returns A 256-bit vector of [16 x i16] containing the shuffled elements.
#define _mm256_shufflelo_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshuflw256( \
      (__v16hi)(__m256i)(a), (int)(imm)))
2011
2012/// Sets each byte of the result to the corresponding byte of the 256-bit
2013/// integer vector in \a __a, the negative of that byte, or zero, depending
2014/// on whether the corresponding byte of the 256-bit integer vector in
2015/// \a __b is greater than zero, less than zero, or equal to zero,
2016/// respectively.
2017///
2018/// \headerfile <immintrin.h>
2019///
2020/// This intrinsic corresponds to the \c VPSIGNB instruction.
2021///
2022/// \param __a
2023/// A 256-bit integer vector.
2024/// \param __b
2025/// A 256-bit integer vector].
2026/// \returns A 256-bit integer vector containing the result.
2027static __inline__ __m256i __DEFAULT_FN_ATTRS256
2028_mm256_sign_epi8(__m256i __a, __m256i __b)
2029{
2030 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2031}
2032
2033/// Sets each element of the result to the corresponding element of the
2034/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2035/// or zero, depending on whether the corresponding element of the 256-bit
2036/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2037/// equal to zero, respectively.
2038///
2039/// \headerfile <immintrin.h>
2040///
2041/// This intrinsic corresponds to the \c VPSIGNW instruction.
2042///
2043/// \param __a
2044/// A 256-bit vector of [16 x i16].
2045/// \param __b
2046/// A 256-bit vector of [16 x i16].
2047/// \returns A 256-bit vector of [16 x i16] containing the result.
2048static __inline__ __m256i __DEFAULT_FN_ATTRS256
2049_mm256_sign_epi16(__m256i __a, __m256i __b)
2050{
2051 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2052}
2053
2054/// Sets each element of the result to the corresponding element of the
2055/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2056/// zero, depending on whether the corresponding element of the 256-bit
2057/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2058/// equal to zero, respectively.
2059///
2060/// \headerfile <immintrin.h>
2061///
2062/// This intrinsic corresponds to the \c VPSIGND instruction.
2063///
2064/// \param __a
2065/// A 256-bit vector of [8 x i32].
2066/// \param __b
2067/// A 256-bit vector of [8 x i32].
2068/// \returns A 256-bit vector of [8 x i32] containing the result.
2069static __inline__ __m256i __DEFAULT_FN_ATTRS256
2070_mm256_sign_epi32(__m256i __a, __m256i __b)
2071{
2072 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2073}
2074
/// Byte-shifts each 128-bit half of the 256-bit integer vector \a a to the
///    left by \a imm bytes, filling the vacated bytes with zero. When
///    \a imm exceeds 15 the result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_slli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    The 256-bit integer vector to shift.
/// \param imm
///    An unsigned immediate giving the shift distance in bytes.
/// \returns A 256-bit integer vector containing the shifted halves.
#define _mm256_slli_si256(a, imm) \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift( \
      (__v4di)(__m256i)(a), (int)(imm)))
2094
/// Byte-shifts each 128-bit half of the 256-bit integer vector \a a to the
///    left by \a imm bytes, filling the vacated bytes with zero. When
///    \a imm exceeds 15 the result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    The 256-bit integer vector to shift.
/// \param imm
///    An unsigned immediate giving the shift distance in bytes.
/// \returns A 256-bit integer vector containing the shifted halves.
#define _mm256_bslli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift( \
      (__v4di)(__m256i)(a), (int)(imm)))
2114
2115/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2116/// left by \a __count bits, shifting in zero bits, and returns the result.
2117/// If \a __count is greater than 15, the returned result is all zeroes.
2118///
2119/// \headerfile <immintrin.h>
2120///
2121/// This intrinsic corresponds to the \c VPSLLW instruction.
2122///
2123/// \param __a
2124/// A 256-bit vector of [16 x i16] to be shifted.
2125/// \param __count
2126/// An unsigned integer value specifying the shift count (in bits).
2127/// \returns A 256-bit vector of [16 x i16] containing the result.
2128static __inline__ __m256i __DEFAULT_FN_ATTRS256
2129_mm256_slli_epi16(__m256i __a, int __count)
2130{
2131 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2132}
2133
2134/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2135/// left by the number of bits specified by the lower 64 bits of \a __count,
2136/// shifting in zero bits, and returns the result. If \a __count is greater
2137/// than 15, the returned result is all zeroes.
2138///
2139/// \headerfile <immintrin.h>
2140///
2141/// This intrinsic corresponds to the \c VPSLLW instruction.
2142///
2143/// \param __a
2144/// A 256-bit vector of [16 x i16] to be shifted.
2145/// \param __count
2146/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2147/// shift count (in bits). The upper element is ignored.
2148/// \returns A 256-bit vector of [16 x i16] containing the result.
2149static __inline__ __m256i __DEFAULT_FN_ATTRS256
2150_mm256_sll_epi16(__m256i __a, __m128i __count)
2151{
2152 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2153}
2154
2155/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2156/// left by \a __count bits, shifting in zero bits, and returns the result.
2157/// If \a __count is greater than 31, the returned result is all zeroes.
2158///
2159/// \headerfile <immintrin.h>
2160///
2161/// This intrinsic corresponds to the \c VPSLLD instruction.
2162///
2163/// \param __a
2164/// A 256-bit vector of [8 x i32] to be shifted.
2165/// \param __count
2166/// An unsigned integer value specifying the shift count (in bits).
2167/// \returns A 256-bit vector of [8 x i32] containing the result.
2168static __inline__ __m256i __DEFAULT_FN_ATTRS256
2169_mm256_slli_epi32(__m256i __a, int __count)
2170{
2171 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2172}
2173
2174/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2175/// left by the number of bits given in the lower 64 bits of \a __count,
2176/// shifting in zero bits, and returns the result. If \a __count is greater
2177/// than 31, the returned result is all zeroes.
2178///
2179/// \headerfile <immintrin.h>
2180///
2181/// This intrinsic corresponds to the \c VPSLLD instruction.
2182///
2183/// \param __a
2184/// A 256-bit vector of [8 x i32] to be shifted.
2185/// \param __count
2186/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2187/// shift count (in bits). The upper element is ignored.
2188/// \returns A 256-bit vector of [8 x i32] containing the result.
2189static __inline__ __m256i __DEFAULT_FN_ATTRS256
2190_mm256_sll_epi32(__m256i __a, __m128i __count)
2191{
2192 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2193}
2194
2195/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2196/// left by \a __count bits, shifting in zero bits, and returns the result.
2197/// If \a __count is greater than 63, the returned result is all zeroes.
2198///
2199/// \headerfile <immintrin.h>
2200///
2201/// This intrinsic corresponds to the \c VPSLLQ instruction.
2202///
2203/// \param __a
2204/// A 256-bit vector of [4 x i64] to be shifted.
2205/// \param __count
2206/// An unsigned integer value specifying the shift count (in bits).
2207/// \returns A 256-bit vector of [4 x i64] containing the result.
2208static __inline__ __m256i __DEFAULT_FN_ATTRS256
2209_mm256_slli_epi64(__m256i __a, int __count)
2210{
2211 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2212}
2213
2214/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2215/// left by the number of bits given in the lower 64 bits of \a __count,
2216/// shifting in zero bits, and returns the result. If \a __count is greater
2217/// than 63, the returned result is all zeroes.
2218///
2219/// \headerfile <immintrin.h>
2220///
2221/// This intrinsic corresponds to the \c VPSLLQ instruction.
2222///
2223/// \param __a
2224/// A 256-bit vector of [4 x i64] to be shifted.
2225/// \param __count
2226/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2227/// shift count (in bits). The upper element is ignored.
2228/// \returns A 256-bit vector of [4 x i64] containing the result.
2229static __inline__ __m256i __DEFAULT_FN_ATTRS256
2230_mm256_sll_epi64(__m256i __a, __m128i __count)
2231{
2232 return __builtin_ia32_psllq256((__v4di)__a, __count);
2233}
2234
2235/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2236/// right by \a __count bits, shifting in sign bits, and returns the result.
2237/// If \a __count is greater than 15, each element of the result is either
2238/// 0 or -1 according to the corresponding input sign bit.
2239///
2240/// \headerfile <immintrin.h>
2241///
2242/// This intrinsic corresponds to the \c VPSRAW instruction.
2243///
2244/// \param __a
2245/// A 256-bit vector of [16 x i16] to be shifted.
2246/// \param __count
2247/// An unsigned integer value specifying the shift count (in bits).
2248/// \returns A 256-bit vector of [16 x i16] containing the result.
2249static __inline__ __m256i __DEFAULT_FN_ATTRS256
2250_mm256_srai_epi16(__m256i __a, int __count)
2251{
2252 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2253}
2254
2255/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2256/// right by the number of bits given in the lower 64 bits of \a __count,
2257/// shifting in sign bits, and returns the result. If \a __count is greater
2258/// than 15, each element of the result is either 0 or -1 according to the
2259/// corresponding input sign bit.
2260///
2261/// \headerfile <immintrin.h>
2262///
2263/// This intrinsic corresponds to the \c VPSRAW instruction.
2264///
2265/// \param __a
2266/// A 256-bit vector of [16 x i16] to be shifted.
2267/// \param __count
2268/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2269/// shift count (in bits). The upper element is ignored.
2270/// \returns A 256-bit vector of [16 x i16] containing the result.
2271static __inline__ __m256i __DEFAULT_FN_ATTRS256
2272_mm256_sra_epi16(__m256i __a, __m128i __count)
2273{
2274 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2275}
2276
2277/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2278/// right by \a __count bits, shifting in sign bits, and returns the result.
2279/// If \a __count is greater than 31, each element of the result is either
2280/// 0 or -1 according to the corresponding input sign bit.
2281///
2282/// \headerfile <immintrin.h>
2283///
2284/// This intrinsic corresponds to the \c VPSRAD instruction.
2285///
2286/// \param __a
2287/// A 256-bit vector of [8 x i32] to be shifted.
2288/// \param __count
2289/// An unsigned integer value specifying the shift count (in bits).
2290/// \returns A 256-bit vector of [8 x i32] containing the result.
2291static __inline__ __m256i __DEFAULT_FN_ATTRS256
2292_mm256_srai_epi32(__m256i __a, int __count)
2293{
2294 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2295}
2296
2297/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2298/// right by the number of bits given in the lower 64 bits of \a __count,
2299/// shifting in sign bits, and returns the result. If \a __count is greater
2300/// than 31, each element of the result is either 0 or -1 according to the
2301/// corresponding input sign bit.
2302///
2303/// \headerfile <immintrin.h>
2304///
2305/// This intrinsic corresponds to the \c VPSRAD instruction.
2306///
2307/// \param __a
2308/// A 256-bit vector of [8 x i32] to be shifted.
2309/// \param __count
2310/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2311/// shift count (in bits). The upper element is ignored.
2312/// \returns A 256-bit vector of [8 x i32] containing the result.
2313static __inline__ __m256i __DEFAULT_FN_ATTRS256
2314_mm256_sra_epi32(__m256i __a, __m128i __count)
2315{
2316 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2317}
2318
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_srli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), \
                                                (int)(imm)))
2338
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), \
                                                (int)(imm)))
2358
2359/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2360/// right by \a __count bits, shifting in zero bits, and returns the result.
2361/// If \a __count is greater than 15, the returned result is all zeroes.
2362///
2363/// \headerfile <immintrin.h>
2364///
2365/// This intrinsic corresponds to the \c VPSRLW instruction.
2366///
2367/// \param __a
2368/// A 256-bit vector of [16 x i16] to be shifted.
2369/// \param __count
2370/// An unsigned integer value specifying the shift count (in bits).
2371/// \returns A 256-bit vector of [16 x i16] containing the result.
2372static __inline__ __m256i __DEFAULT_FN_ATTRS256
2373_mm256_srli_epi16(__m256i __a, int __count)
2374{
2375 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2376}
2377
2378/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2379/// right by the number of bits given in the lower 64 bits of \a __count,
2380/// shifting in zero bits, and returns the result. If \a __count is greater
2381/// than 15, the returned result is all zeroes.
2382///
2383/// \headerfile <immintrin.h>
2384///
2385/// This intrinsic corresponds to the \c VPSRLW instruction.
2386///
2387/// \param __a
2388/// A 256-bit vector of [16 x i16] to be shifted.
2389/// \param __count
2390/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2391/// shift count (in bits). The upper element is ignored.
2392/// \returns A 256-bit vector of [16 x i16] containing the result.
2393static __inline__ __m256i __DEFAULT_FN_ATTRS256
2394_mm256_srl_epi16(__m256i __a, __m128i __count)
2395{
2396 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2397}
2398
2399/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2400/// right by \a __count bits, shifting in zero bits, and returns the result.
2401/// If \a __count is greater than 31, the returned result is all zeroes.
2402///
2403/// \headerfile <immintrin.h>
2404///
2405/// This intrinsic corresponds to the \c VPSRLD instruction.
2406///
2407/// \param __a
2408/// A 256-bit vector of [8 x i32] to be shifted.
2409/// \param __count
2410/// An unsigned integer value specifying the shift count (in bits).
2411/// \returns A 256-bit vector of [8 x i32] containing the result.
2412static __inline__ __m256i __DEFAULT_FN_ATTRS256
2413_mm256_srli_epi32(__m256i __a, int __count)
2414{
2415 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2416}
2417
2418/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2419/// right by the number of bits given in the lower 64 bits of \a __count,
2420/// shifting in zero bits, and returns the result. If \a __count is greater
2421/// than 31, the returned result is all zeroes.
2422///
2423/// \headerfile <immintrin.h>
2424///
2425/// This intrinsic corresponds to the \c VPSRLD instruction.
2426///
2427/// \param __a
2428/// A 256-bit vector of [8 x i32] to be shifted.
2429/// \param __count
2430/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2431/// shift count (in bits). The upper element is ignored.
2432/// \returns A 256-bit vector of [8 x i32] containing the result.
2433static __inline__ __m256i __DEFAULT_FN_ATTRS256
2434_mm256_srl_epi32(__m256i __a, __m128i __count)
2435{
2436 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2437}
2438
2439/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2440/// right by \a __count bits, shifting in zero bits, and returns the result.
2441/// If \a __count is greater than 63, the returned result is all zeroes.
2442///
2443/// \headerfile <immintrin.h>
2444///
2445/// This intrinsic corresponds to the \c VPSRLQ instruction.
2446///
2447/// \param __a
2448/// A 256-bit vector of [4 x i64] to be shifted.
2449/// \param __count
2450/// An unsigned integer value specifying the shift count (in bits).
2451/// \returns A 256-bit vector of [4 x i64] containing the result.
2452static __inline__ __m256i __DEFAULT_FN_ATTRS256
2453_mm256_srli_epi64(__m256i __a, int __count)
2454{
2455 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2456}
2457
2458/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2459/// right by the number of bits given in the lower 64 bits of \a __count,
2460/// shifting in zero bits, and returns the result. If \a __count is greater
2461/// than 63, the returned result is all zeroes.
2462///
2463/// \headerfile <immintrin.h>
2464///
2465/// This intrinsic corresponds to the \c VPSRLQ instruction.
2466///
2467/// \param __a
2468/// A 256-bit vector of [4 x i64] to be shifted.
2469/// \param __count
2470/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2471/// shift count (in bits). The upper element is ignored.
2472/// \returns A 256-bit vector of [4 x i64] containing the result.
2473static __inline__ __m256i __DEFAULT_FN_ATTRS256
2474_mm256_srl_epi64(__m256i __a, __m128i __count)
2475{
2476 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2477}
2478
2479/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2480/// vectors. Returns the lower 8 bits of each difference in the
2481/// corresponding byte of the 256-bit integer vector result (overflow is
2482/// ignored).
2483///
2484/// \code{.operation}
2485/// FOR i := 0 TO 31
2486/// j := i*8
2487/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2488/// ENDFOR
2489/// \endcode
2490///
2491/// \headerfile <immintrin.h>
2492///
2493/// This intrinsic corresponds to the \c VPSUBB instruction.
2494///
2495/// \param __a
2496/// A 256-bit integer vector containing the minuends.
2497/// \param __b
2498/// A 256-bit integer vector containing the subtrahends.
2499/// \returns A 256-bit integer vector containing the differences.
2500static __inline__ __m256i __DEFAULT_FN_ATTRS256
2501_mm256_sub_epi8(__m256i __a, __m256i __b)
2502{
2503 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2504}
2505
2506/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2507/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2508/// the corresponding element of the [16 x i16] result (overflow is
2509/// ignored).
2510///
2511/// \code{.operation}
2512/// FOR i := 0 TO 15
2513/// j := i*16
2514/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2515/// ENDFOR
2516/// \endcode
2517///
2518/// \headerfile <immintrin.h>
2519///
2520/// This intrinsic corresponds to the \c VPSUBW instruction.
2521///
2522/// \param __a
2523/// A 256-bit vector of [16 x i16] containing the minuends.
2524/// \param __b
2525/// A 256-bit vector of [16 x i16] containing the subtrahends.
2526/// \returns A 256-bit vector of [16 x i16] containing the differences.
2527static __inline__ __m256i __DEFAULT_FN_ATTRS256
2528_mm256_sub_epi16(__m256i __a, __m256i __b)
2529{
2530 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2531}
2532
2533/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2534/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2535/// the corresponding element of the [8 x i32] result (overflow is ignored).
2536///
2537/// \code{.operation}
2538/// FOR i := 0 TO 7
2539/// j := i*32
2540/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2541/// ENDFOR
2542/// \endcode
2543///
2544/// \headerfile <immintrin.h>
2545///
2546/// This intrinsic corresponds to the \c VPSUBD instruction.
2547///
2548/// \param __a
2549/// A 256-bit vector of [8 x i32] containing the minuends.
2550/// \param __b
2551/// A 256-bit vector of [8 x i32] containing the subtrahends.
2552/// \returns A 256-bit vector of [8 x i32] containing the differences.
2553static __inline__ __m256i __DEFAULT_FN_ATTRS256
2554_mm256_sub_epi32(__m256i __a, __m256i __b)
2555{
2556 return (__m256i)((__v8su)__a - (__v8su)__b);
2557}
2558
2559/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2560/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2561/// the corresponding element of the [4 x i64] result (overflow is ignored).
2562///
2563/// \code{.operation}
2564/// FOR i := 0 TO 3
2565/// j := i*64
2566/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2567/// ENDFOR
2568/// \endcode
2569///
2570/// \headerfile <immintrin.h>
2571///
2572/// This intrinsic corresponds to the \c VPSUBQ instruction.
2573///
2574/// \param __a
2575/// A 256-bit vector of [4 x i64] containing the minuends.
2576/// \param __b
2577/// A 256-bit vector of [4 x i64] containing the subtrahends.
2578/// \returns A 256-bit vector of [4 x i64] containing the differences.
2579static __inline__ __m256i __DEFAULT_FN_ATTRS256
2580_mm256_sub_epi64(__m256i __a, __m256i __b)
2581{
2582 return (__m256i)((__v4du)__a - (__v4du)__b);
2583}
2584
2585/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2586/// vectors using signed saturation, and returns each differences in the
2587/// corresponding byte of the 256-bit integer vector result.
2588///
2589/// \code{.operation}
2590/// FOR i := 0 TO 31
2591/// j := i*8
2592/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2593/// ENDFOR
2594/// \endcode
2595///
2596/// \headerfile <immintrin.h>
2597///
2598/// This intrinsic corresponds to the \c VPSUBSB instruction.
2599///
2600/// \param __a
2601/// A 256-bit integer vector containing the minuends.
2602/// \param __b
2603/// A 256-bit integer vector containing the subtrahends.
2604/// \returns A 256-bit integer vector containing the differences.
2605static __inline__ __m256i __DEFAULT_FN_ATTRS256
2606_mm256_subs_epi8(__m256i __a, __m256i __b)
2607{
2608 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2609}
2610
2611/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2612/// vectors of [16 x i16] using signed saturation, and returns each
2613/// difference in the corresponding element of the [16 x i16] result.
2614///
2615/// \code{.operation}
2616/// FOR i := 0 TO 15
2617/// j := i*16
2618/// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
2619/// ENDFOR
2620/// \endcode
2621///
2622/// \headerfile <immintrin.h>
2623///
2624/// This intrinsic corresponds to the \c VPSUBSW instruction.
2625///
2626/// \param __a
2627/// A 256-bit vector of [16 x i16] containing the minuends.
2628/// \param __b
2629/// A 256-bit vector of [16 x i16] containing the subtrahends.
2630/// \returns A 256-bit vector of [16 x i16] containing the differences.
2631static __inline__ __m256i __DEFAULT_FN_ATTRS256
2632_mm256_subs_epi16(__m256i __a, __m256i __b)
2633{
2634 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2635}
2636
2637/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2638/// vectors using unsigned saturation, and returns each difference in the
2639/// corresponding byte of the 256-bit integer vector result. For each byte,
2640/// computes <c> result = __a - __b </c>.
2641///
2642/// \code{.operation}
2643/// FOR i := 0 TO 31
2644/// j := i*8
2645/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2646/// ENDFOR
2647/// \endcode
2648///
2649/// \headerfile <immintrin.h>
2650///
2651/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2652///
2653/// \param __a
2654/// A 256-bit integer vector containing the minuends.
2655/// \param __b
2656/// A 256-bit integer vector containing the subtrahends.
2657/// \returns A 256-bit integer vector containing the differences.
2658static __inline__ __m256i __DEFAULT_FN_ATTRS256
2659_mm256_subs_epu8(__m256i __a, __m256i __b)
2660{
2661 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2662}
2663
2664/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2665/// vectors of [16 x i16] using unsigned saturation, and returns each
2666/// difference in the corresponding element of the [16 x i16] result.
2667///
2668/// \code{.operation}
2669/// FOR i := 0 TO 15
2670/// j := i*16
2671/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2672/// ENDFOR
2673/// \endcode
2674///
2675/// \headerfile <immintrin.h>
2676///
2677/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2678///
2679/// \param __a
2680/// A 256-bit vector of [16 x i16] containing the minuends.
2681/// \param __b
2682/// A 256-bit vector of [16 x i16] containing the subtrahends.
2683/// \returns A 256-bit vector of [16 x i16] containing the differences.
2684static __inline__ __m256i __DEFAULT_FN_ATTRS256
2685_mm256_subs_epu16(__m256i __a, __m256i __b)
2686{
2687 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2688}
2689
2690/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2691/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2692/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2693/// input; other bits in these parameters are ignored.
2694///
2695/// \code{.operation}
2696/// result[7:0] := __a[71:64]
2697/// result[15:8] := __b[71:64]
2698/// result[23:16] := __a[79:72]
2699/// result[31:24] := __b[79:72]
2700/// . . .
2701/// result[127:120] := __b[127:120]
2702/// result[135:128] := __a[199:192]
2703/// . . .
2704/// result[255:248] := __b[255:248]
2705/// \endcode
2706///
2707/// \headerfile <immintrin.h>
2708///
2709/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2710///
2711/// \param __a
2712/// A 256-bit integer vector used as the source for the even-numbered bytes
2713/// of the result.
2714/// \param __b
2715/// A 256-bit integer vector used as the source for the odd-numbered bytes
2716/// of the result.
2717/// \returns A 256-bit integer vector containing the result.
2718static __inline__ __m256i __DEFAULT_FN_ATTRS256
2720{
2721 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2722}
2723
2724/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2725/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2726/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2727/// 128-bit half of \a __a and \a __b as input; other bits in these
2728/// parameters are ignored.
2729///
2730/// \code{.operation}
2731/// result[15:0] := __a[79:64]
2732/// result[31:16] := __b[79:64]
2733/// result[47:32] := __a[95:80]
2734/// result[63:48] := __b[95:80]
2735/// . . .
2736/// result[127:112] := __b[127:112]
2737/// result[143:128] := __a[211:196]
2738/// . . .
2739/// result[255:240] := __b[255:240]
2740/// \endcode
2741///
2742/// \headerfile <immintrin.h>
2743///
2744/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2745///
2746/// \param __a
2747/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2748/// elements of the result.
2749/// \param __b
2750/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2751/// elements of the result.
2752/// \returns A 256-bit vector of [16 x i16] containing the result.
2753static __inline__ __m256i __DEFAULT_FN_ATTRS256
2755{
2756 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2757}
2758
2759/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2760/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2761/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2762/// of \a __a and \a __b as input; other bits in these parameters are
2763/// ignored.
2764///
2765/// \code{.operation}
2766/// result[31:0] := __a[95:64]
2767/// result[63:32] := __b[95:64]
2768/// result[95:64] := __a[127:96]
2769/// result[127:96] := __b[127:96]
2770/// result[159:128] := __a[223:192]
2771/// result[191:160] := __b[223:192]
2772/// result[223:192] := __a[255:224]
2773/// result[255:224] := __b[255:224]
2774/// \endcode
2775///
2776/// \headerfile <immintrin.h>
2777///
2778/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2779///
2780/// \param __a
2781/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2782/// elements of the result.
2783/// \param __b
2784/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2785/// elements of the result.
2786/// \returns A 256-bit vector of [8 x i32] containing the result.
2787static __inline__ __m256i __DEFAULT_FN_ATTRS256
2789{
2790 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2791}
2792
2793/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2794/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2795/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2796/// of \a __a and \a __b as input; other bits in these parameters are
2797/// ignored.
2798///
2799/// \code{.operation}
2800/// result[63:0] := __a[127:64]
2801/// result[127:64] := __b[127:64]
2802/// result[191:128] := __a[255:192]
2803/// result[255:192] := __b[255:192]
2804/// \endcode
2805///
2806/// \headerfile <immintrin.h>
2807///
2808/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2809///
2810/// \param __a
2811/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2812/// elements of the result.
2813/// \param __b
2814/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2815/// elements of the result.
2816/// \returns A 256-bit vector of [4 x i64] containing the result.
2817static __inline__ __m256i __DEFAULT_FN_ATTRS256
2819{
2820 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2821}
2822
2823/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2824/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2825/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2826/// input; other bits in these parameters are ignored.
2827///
2828/// \code{.operation}
2829/// result[7:0] := __a[7:0]
2830/// result[15:8] := __b[7:0]
2831/// result[23:16] := __a[15:8]
2832/// result[31:24] := __b[15:8]
2833/// . . .
2834/// result[127:120] := __b[63:56]
2835/// result[135:128] := __a[135:128]
2836/// . . .
2837/// result[255:248] := __b[191:184]
2838/// \endcode
2839///
2840/// \headerfile <immintrin.h>
2841///
2842/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2843///
2844/// \param __a
2845/// A 256-bit integer vector used as the source for the even-numbered bytes
2846/// of the result.
2847/// \param __b
2848/// A 256-bit integer vector used as the source for the odd-numbered bytes
2849/// of the result.
2850/// \returns A 256-bit integer vector containing the result.
2851static __inline__ __m256i __DEFAULT_FN_ATTRS256
2853{
2854 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2855}
2856
2857/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2858/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2859/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2860/// 128-bit half of \a __a and \a __b as input; other bits in these
2861/// parameters are ignored.
2862///
2863/// \code{.operation}
2864/// result[15:0] := __a[15:0]
2865/// result[31:16] := __b[15:0]
2866/// result[47:32] := __a[31:16]
2867/// result[63:48] := __b[31:16]
2868/// . . .
2869/// result[127:112] := __b[63:48]
2870/// result[143:128] := __a[143:128]
2871/// . . .
2872/// result[255:239] := __b[191:176]
2873/// \endcode
2874///
2875/// \headerfile <immintrin.h>
2876///
2877/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2878///
2879/// \param __a
2880/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2881/// elements of the result.
2882/// \param __b
2883/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2884/// elements of the result.
2885/// \returns A 256-bit vector of [16 x i16] containing the result.
2886static __inline__ __m256i __DEFAULT_FN_ATTRS256
2888{
2889 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2890}
2891
2892/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2893/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2894/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2895/// of \a __a and \a __b as input; other bits in these parameters are
2896/// ignored.
2897///
2898/// \code{.operation}
2899/// result[31:0] := __a[31:0]
2900/// result[63:32] := __b[31:0]
2901/// result[95:64] := __a[63:32]
2902/// result[127:96] := __b[63:32]
2903/// result[159:128] := __a[159:128]
2904/// result[191:160] := __b[159:128]
2905/// result[223:192] := __a[191:160]
2906/// result[255:224] := __b[191:190]
2907/// \endcode
2908///
2909/// \headerfile <immintrin.h>
2910///
2911/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2912///
2913/// \param __a
2914/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2915/// elements of the result.
2916/// \param __b
2917/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2918/// elements of the result.
2919/// \returns A 256-bit vector of [8 x i32] containing the result.
2920static __inline__ __m256i __DEFAULT_FN_ATTRS256
2922{
2923 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2924}
2925
2926/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2927/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2928/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2929/// of \a __a and \a __b as input; other bits in these parameters are
2930/// ignored.
2931///
2932/// \code{.operation}
2933/// result[63:0] := __a[63:0]
2934/// result[127:64] := __b[63:0]
2935/// result[191:128] := __a[191:128]
2936/// result[255:192] := __b[191:128]
2937/// \endcode
2938///
2939/// \headerfile <immintrin.h>
2940///
2941/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2942///
2943/// \param __a
2944/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2945/// elements of the result.
2946/// \param __b
2947/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2948/// elements of the result.
2949/// \returns A 256-bit vector of [4 x i64] containing the result.
2950static __inline__ __m256i __DEFAULT_FN_ATTRS256
2952{
2953 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2954}
2955
2956/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2957/// \a __b.
2958///
2959/// \headerfile <immintrin.h>
2960///
2961/// This intrinsic corresponds to the \c VPXOR instruction.
2962///
2963/// \param __a
2964/// A 256-bit integer vector.
2965/// \param __b
2966/// A 256-bit integer vector.
2967/// \returns A 256-bit integer vector containing the result.
2968static __inline__ __m256i __DEFAULT_FN_ATTRS256
2969_mm256_xor_si256(__m256i __a, __m256i __b)
2970{
2971 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2972}
2973
2974/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2975/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2976/// boundary.
2977///
2978/// \headerfile <immintrin.h>
2979///
2980/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2981///
2982/// \param __V
2983/// A pointer to the 32-byte aligned memory containing the vector to load.
2984/// \returns A 256-bit integer vector loaded from memory.
2985static __inline__ __m256i __DEFAULT_FN_ATTRS256
2987{
2988 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2989 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2990}
2991
2992/// Broadcasts the 32-bit floating-point value from the low element of the
2993/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2994/// 128-bit vector of [4 x float].
2995///
2996/// \headerfile <immintrin.h>
2997///
2998/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2999///
3000/// \param __X
3001/// A 128-bit vector of [4 x float] whose low element will be broadcast.
3002/// \returns A 128-bit vector of [4 x float] containing the result.
3003static __inline__ __m128 __DEFAULT_FN_ATTRS128
3005{
3006 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3007}
3008
3009/// Broadcasts the 64-bit floating-point value from the low element of the
3010/// 128-bit vector of [2 x double] in \a __a to both elements of the
3011/// result's 128-bit vector of [2 x double].
3012///
3013/// \headerfile <immintrin.h>
3014///
3015/// This intrinsic corresponds to the \c MOVDDUP instruction.
3016///
3017/// \param __a
3018/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3019/// \returns A 128-bit vector of [2 x double] containing the result.
3020static __inline__ __m128d __DEFAULT_FN_ATTRS128
3022{
3023 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3024}
3025
3026/// Broadcasts the 32-bit floating-point value from the low element of the
3027/// 128-bit vector of [4 x float] in \a __X to all elements of the
3028/// result's 256-bit vector of [8 x float].
3029///
3030/// \headerfile <immintrin.h>
3031///
3032/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3033///
3034/// \param __X
3035/// A 128-bit vector of [4 x float] whose low element will be broadcast.
3036/// \returns A 256-bit vector of [8 x float] containing the result.
3037static __inline__ __m256 __DEFAULT_FN_ATTRS256
3039{
3040 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3041}
3042
3043/// Broadcasts the 64-bit floating-point value from the low element of the
3044/// 128-bit vector of [2 x double] in \a __X to all elements of the
3045/// result's 256-bit vector of [4 x double].
3046///
3047/// \headerfile <immintrin.h>
3048///
3049/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3050///
3051/// \param __X
3052/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3053/// \returns A 256-bit vector of [4 x double] containing the result.
3054static __inline__ __m256d __DEFAULT_FN_ATTRS256
3056{
3057 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3058}
3059
3060/// Broadcasts the 128-bit integer data from \a __X to both the lower and
3061/// upper halves of the 256-bit result.
3062///
3063/// \headerfile <immintrin.h>
3064///
3065/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3066///
3067/// \param __X
3068/// A 128-bit integer vector to be broadcast.
3069/// \returns A 256-bit integer vector containing the result.
3070static __inline__ __m256i __DEFAULT_FN_ATTRS256
3072{
3073 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3074}
3075
3076#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3077
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                      (__v4si)(__m128i)(V2), (int)(M)))

/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                      (__v8si)(__m256i)(V2), (int)(M)))

3152/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3153/// bytes of the 256-bit result.
3154///
3155/// \headerfile <immintrin.h>
3156///
3157/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3158///
3159/// \param __X
3160/// A 128-bit integer vector whose low byte will be broadcast.
3161/// \returns A 256-bit integer vector containing the result.
3162static __inline__ __m256i __DEFAULT_FN_ATTRS256
3164{
3165 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3166}
3167
3168/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3169/// to all elements of the result's 256-bit vector of [16 x i16].
3170///
3171/// \headerfile <immintrin.h>
3172///
3173/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3174///
3175/// \param __X
3176/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3177/// \returns A 256-bit vector of [16 x i16] containing the result.
3178static __inline__ __m256i __DEFAULT_FN_ATTRS256
3180{
3181 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3182}
3183
3184/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185/// to all elements of the result's 256-bit vector of [8 x i32].
3186///
3187/// \headerfile <immintrin.h>
3188///
3189/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3190///
3191/// \param __X
3192/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193/// \returns A 256-bit vector of [8 x i32] containing the result.
3194static __inline__ __m256i __DEFAULT_FN_ATTRS256
3196{
3197 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3198}
3199
3200/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3201/// to all elements of the result's 256-bit vector of [4 x i64].
3202///
3203/// \headerfile <immintrin.h>
3204///
3205/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3206///
3207/// \param __X
3208/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3209/// \returns A 256-bit vector of [4 x i64] containing the result.
3210static __inline__ __m256i __DEFAULT_FN_ATTRS256
3212{
3213 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3214}
3215
3216/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3217/// bytes of the 128-bit result.
3218///
3219/// \headerfile <immintrin.h>
3220///
3221/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3222///
3223/// \param __X
3224/// A 128-bit integer vector whose low byte will be broadcast.
3225/// \returns A 128-bit integer vector containing the result.
3226static __inline__ __m128i __DEFAULT_FN_ATTRS128
3228{
3229 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3230}
3231
3232/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3233/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3234///
3235/// \headerfile <immintrin.h>
3236///
3237/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3238///
3239/// \param __X
3240/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3241/// \returns A 128-bit vector of [8 x i16] containing the result.
3242static __inline__ __m128i __DEFAULT_FN_ATTRS128
3244{
3245 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3246}
3247
3248/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3249/// to all elements of the result's vector of [4 x i32].
3250///
3251/// \headerfile <immintrin.h>
3252///
3253/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3254///
3255/// \param __X
3256/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3257/// \returns A 128-bit vector of [4 x i32] containing the result.
3258static __inline__ __m128i __DEFAULT_FN_ATTRS128
3260{
3261 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3262}
3263
3264/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3265/// to both elements of the result's 128-bit vector of [2 x i64].
3266///
3267/// \headerfile <immintrin.h>
3268///
3269/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3270///
3271/// \param __X
3272/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3273/// \returns A 128-bit vector of [2 x i64] containing the result.
3274static __inline__ __m128i __DEFAULT_FN_ATTRS128
3276{
3277 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3278}
3279
3280/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3281/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3282/// elements of the 256-bit vector of [8 x i32] in \a __b.
3283///
3284/// \code{.operation}
3285/// FOR i := 0 TO 7
3286/// j := i*32
3287/// k := __b[j+2:j] * 32
3288/// result[j+31:j] := __a[k+31:k]
3289/// ENDFOR
3290/// \endcode
3291///
3292/// \headerfile <immintrin.h>
3293///
3294/// This intrinsic corresponds to the \c VPERMD instruction.
3295///
3296/// \param __a
3297/// A 256-bit vector of [8 x i32] containing the source values.
3298/// \param __b
3299/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3300/// \a __a.
3301/// \returns A 256-bit vector of [8 x i32] containing the result.
3302static __inline__ __m256i __DEFAULT_FN_ATTRS256
3304{
3305 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3306}
3307
/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
///    the 256-bit vector of [4 x double] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))

3338/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3339/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3340/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3341///
3342/// \code{.operation}
3343/// FOR i := 0 TO 7
3344/// j := i*32
3345/// k := __b[j+2:j] * 32
3346/// result[j+31:j] := __a[k+31:k]
3347/// ENDFOR
3348/// \endcode
3349///
3350/// \headerfile <immintrin.h>
3351///
3352/// This intrinsic corresponds to the \c VPERMPS instruction.
3353///
3354/// \param __a
3355/// A 256-bit vector of [8 x float] containing the source values.
3356/// \param __b
3357/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3358/// \a __a.
3359/// \returns A 256-bit vector of [8 x float] containing the result.
3360static __inline__ __m256 __DEFAULT_FN_ATTRS256
3362{
3363 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3364}
3365
3366/// Sets the result's 256-bit vector of [4 x i64] result to copies of elements
3367/// of the 256-bit vector of [4 x i64] in \a V as specified by the
3368/// immediate value \a M.
3369///
3370/// \code{.operation}
3371/// FOR i := 0 TO 3
3372/// j := i*64
3373/// k := (M >> i*2)[1:0] * 64
3374/// result[j+63:j] := V[k+63:k]
3375/// ENDFOR
3376/// \endcode
3377///
3378/// \headerfile <immintrin.h>
3379///
3380/// \code
3381/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3382/// \endcode
3383///
3384/// This intrinsic corresponds to the \c VPERMQ instruction.
3385///
3386/// \param V
3387/// A 256-bit vector of [4 x i64] containing the source values.
3388/// \param M
3389/// An immediate 8-bit value specifying which elements to copy from \a V.
3390/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3391/// \a M[3:2] specifies the index for element 1, and so forth.
3392/// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256( \
      /* source, viewed as [4 x i64] */ (__v4di)(__m256i)(V), \
      /* element-selection immediate */ (int)(M)))
3395
3396/// Sets each half of the 256-bit result either to zero or to one of the
3397/// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3398/// as specified by the immediate value \a M.
3399///
3400/// \code{.operation}
3401/// FOR i := 0 TO 1
3402/// j := i*128
3403/// k := M >> (i*4)
3404/// IF k[3] == 0
3405/// CASE (k[1:0]) OF
3406/// 0: result[127+j:j] := V1[127:0]
3407/// 1: result[127+j:j] := V1[255:128]
3408/// 2: result[127+j:j] := V2[127:0]
3409/// 3: result[127+j:j] := V2[255:128]
3410/// ESAC
3411/// ELSE
3412/// result[127+j:j] := 0
3413/// FI
3414/// ENDFOR
3415/// \endcode
3416///
3417/// \headerfile <immintrin.h>
3418///
3419/// \code
3420/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3421/// \endcode
3422///
3423/// This intrinsic corresponds to the \c VPERM2I128 instruction.
3424///
3425/// \param V1
3426/// A 256-bit integer vector containing source values.
3427/// \param V2
3428/// A 256-bit integer vector containing source values.
3429/// \param M
3430/// An immediate value specifying how to form the result. Bits [3:0]
3431/// control the lower half of the result, bits [7:4] control the upper half.
3432/// Within each 4-bit control value, if bit 3 is 1, the result is zero,
3433/// otherwise bits [1:0] determine the source as follows. \n
3434/// 0: the lower half of \a V1 \n
3435/// 1: the upper half of \a V1 \n
3436/// 2: the lower half of \a V2 \n
3437/// 3: the upper half of \a V2
3438/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      /* first source */ (__m256i)(V1), \
      /* second source */ (__m256i)(V2), \
      /* half-selection immediate */ (int)(M)))
3441
3442/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3443/// of the immediate \a M is zero, extracts the lower half of \a V;
3444/// otherwise, extracts the upper half.
3445///
3446/// \headerfile <immintrin.h>
3447///
3448/// \code
3449/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3450/// \endcode
3451///
3452/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3453///
3454/// \param V
3455/// A 256-bit integer vector containing the source values.
3456/// \param M
3457/// An immediate value specifying which half of \a V to extract.
3458/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      /* source, viewed as [4 x i64] */ (__v4di)(__m256i)(V), \
      /* half selector */ (int)(M)))
3461
3462/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3463/// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3464/// is zero, overwrites the lower half of the result; otherwise,
3465/// overwrites the upper half.
3466///
3467/// \headerfile <immintrin.h>
3468///
3469/// \code
3470/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3471/// \endcode
3472///
3473/// This intrinsic corresponds to the \c VINSERTI128 instruction.
3474///
3475/// \param V1
3476/// A 256-bit integer vector containing a source value.
3477/// \param V2
3478/// A 128-bit integer vector containing a source value.
3479/// \param M
3480/// An immediate value specifying where to put \a V2 in the result.
3481/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      /* base vector, viewed as [4 x i64] */ (__v4di)(__m256i)(V1), \
      /* 128-bit replacement, viewed as [2 x i64] */ (__v2di)(__m128i)(V2), \
      /* position selector */ (int)(M)))
3485
3486/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3487/// the most significant bit of the corresponding element in the mask
3488/// \a __M is set; otherwise, sets that element of the result to zero.
3489/// Returns the 256-bit [8 x i32] result.
3490///
3491/// \code{.operation}
3492/// FOR i := 0 TO 7
3493/// j := i*32
3494/// IF __M[j+31] == 1
3495/// result[j+31:j] := Load32(__X+(i*4))
3496/// ELSE
3497/// result[j+31:j] := 0
3498/// FI
3499/// ENDFOR
3500/// \endcode
3501///
3502/// \headerfile <immintrin.h>
3503///
3504/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3505///
3506/// \param __X
3507/// A pointer to the memory used for loading values.
3508/// \param __M
3509/// A 256-bit vector of [8 x i32] containing the mask bits.
3510/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3511/// elements.
3512static __inline__ __m256i __DEFAULT_FN_ATTRS256
3513_mm256_maskload_epi32(int const *__X, __m256i __M)
3514{
3515 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3516}
3517
3518/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3519/// the most significant bit of the corresponding element in the mask
3520/// \a __M is set; otherwise, sets that element of the result to zero.
3521/// Returns the 256-bit [4 x i64] result.
3522///
3523/// \code{.operation}
3524/// FOR i := 0 TO 3
3525/// j := i*64
3526/// IF __M[j+63] == 1
3527/// result[j+63:j] := Load64(__X+(i*8))
3528/// ELSE
3529/// result[j+63:j] := 0
3530/// FI
3531/// ENDFOR
3532/// \endcode
3533///
3534/// \headerfile <immintrin.h>
3535///
3536/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3537///
3538/// \param __X
3539/// A pointer to the memory used for loading values.
3540/// \param __M
3541/// A 256-bit vector of [4 x i64] containing the mask bits.
3542/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3543/// elements.
3544static __inline__ __m256i __DEFAULT_FN_ATTRS256
3545_mm256_maskload_epi64(long long const *__X, __m256i __M)
3546{
3547 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3548}
3549
3550/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3551/// the most significant bit of the corresponding element in the mask
3552/// \a __M is set; otherwise, sets that element of the result to zero.
3553/// Returns the 128-bit [4 x i32] result.
3554///
3555/// \code{.operation}
3556/// FOR i := 0 TO 3
3557/// j := i*32
3558/// IF __M[j+31] == 1
3559/// result[j+31:j] := Load32(__X+(i*4))
3560/// ELSE
3561/// result[j+31:j] := 0
3562/// FI
3563/// ENDFOR
3564/// \endcode
3565///
3566/// \headerfile <immintrin.h>
3567///
3568/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3569///
3570/// \param __X
3571/// A pointer to the memory used for loading values.
3572/// \param __M
3573/// A 128-bit vector of [4 x i32] containing the mask bits.
3574/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3575/// elements.
3576static __inline__ __m128i __DEFAULT_FN_ATTRS128
3577_mm_maskload_epi32(int const *__X, __m128i __M)
3578{
3579 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3580}
3581
3582/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3583/// the most significant bit of the corresponding element in the mask
3584/// \a __M is set; otherwise, sets that element of the result to zero.
3585/// Returns the 128-bit [2 x i64] result.
3586///
3587/// \code{.operation}
3588/// FOR i := 0 TO 1
3589/// j := i*64
3590/// IF __M[j+63] == 1
3591/// result[j+63:j] := Load64(__X+(i*8))
3592/// ELSE
3593/// result[j+63:j] := 0
3594/// FI
3595/// ENDFOR
3596/// \endcode
3597///
3598/// \headerfile <immintrin.h>
3599///
3600/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3601///
3602/// \param __X
3603/// A pointer to the memory used for loading values.
3604/// \param __M
3605/// A 128-bit vector of [2 x i64] containing the mask bits.
3606/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3607/// elements.
3608static __inline__ __m128i __DEFAULT_FN_ATTRS128
3609_mm_maskload_epi64(long long const *__X, __m128i __M)
3610{
3611 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3612}
3613
3614/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3615/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3616/// the corresponding element in the mask \a __M is set; otherwise, the
3617/// memory element is unchanged.
3618///
3619/// \code{.operation}
3620/// FOR i := 0 TO 7
3621/// j := i*32
3622/// IF __M[j+31] == 1
3623/// Store32(__X+(i*4), __Y[j+31:j])
3624/// FI
3625/// ENDFOR
3626/// \endcode
3627///
3628/// \headerfile <immintrin.h>
3629///
3630/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3631///
3632/// \param __X
3633/// A pointer to the memory used for storing values.
3634/// \param __M
3635/// A 256-bit vector of [8 x i32] containing the mask bits.
3636/// \param __Y
3637/// A 256-bit vector of [8 x i32] containing the values to store.
3638static __inline__ void __DEFAULT_FN_ATTRS256
3639_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3640{
3641 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3642}
3643
3644/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3645/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3646/// the corresponding element in the mask \a __M is set; otherwise, the
3647/// memory element is unchanged.
3648///
3649/// \code{.operation}
3650/// FOR i := 0 TO 3
3651/// j := i*64
3652/// IF __M[j+63] == 1
3653/// Store64(__X+(i*8), __Y[j+63:j])
3654/// FI
3655/// ENDFOR
3656/// \endcode
3657///
3658/// \headerfile <immintrin.h>
3659///
3660/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3661///
3662/// \param __X
3663/// A pointer to the memory used for storing values.
3664/// \param __M
3665/// A 256-bit vector of [4 x i64] containing the mask bits.
3666/// \param __Y
3667/// A 256-bit vector of [4 x i64] containing the values to store.
3668static __inline__ void __DEFAULT_FN_ATTRS256
3669_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3670{
3671 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3672}
3673
3674/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3675/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3676/// the corresponding element in the mask \a __M is set; otherwise, the
3677/// memory element is unchanged.
3678///
3679/// \code{.operation}
3680/// FOR i := 0 TO 3
3681/// j := i*32
3682/// IF __M[j+31] == 1
3683/// Store32(__X+(i*4), __Y[j+31:j])
3684/// FI
3685/// ENDFOR
3686/// \endcode
3687///
3688/// \headerfile <immintrin.h>
3689///
3690/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3691///
3692/// \param __X
3693/// A pointer to the memory used for storing values.
3694/// \param __M
3695/// A 128-bit vector of [4 x i32] containing the mask bits.
3696/// \param __Y
3697/// A 128-bit vector of [4 x i32] containing the values to store.
3698static __inline__ void __DEFAULT_FN_ATTRS128
3699_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3700{
3701 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3702}
3703
3704/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3705/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3706/// the corresponding element in the mask \a __M is set; otherwise, the
3707/// memory element is unchanged.
3708///
3709/// \code{.operation}
3710/// FOR i := 0 TO 1
3711/// j := i*64
3712/// IF __M[j+63] == 1
3713/// Store64(__X+(i*8), __Y[j+63:j])
3714/// FI
3715/// ENDFOR
3716/// \endcode
3717///
3718/// \headerfile <immintrin.h>
3719///
3720/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3721///
3722/// \param __X
3723/// A pointer to the memory used for storing values.
3724/// \param __M
3725/// A 128-bit vector of [2 x i64] containing the mask bits.
3726/// \param __Y
3727/// A 128-bit vector of [2 x i64] containing the values to store.
3728static __inline__ void __DEFAULT_FN_ATTRS128
3729_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3730{
3731 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3732}
3733
3734/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3735/// left by the number of bits given in the corresponding element of the
3736/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3737/// returns the result. If the shift count for any element is greater than
3738/// 31, the result for that element is zero.
3739///
3740/// \headerfile <immintrin.h>
3741///
3742/// This intrinsic corresponds to the \c VPSLLVD instruction.
3743///
3744/// \param __X
3745/// A 256-bit vector of [8 x i32] to be shifted.
3746/// \param __Y
3747/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3748/// bits).
3749/// \returns A 256-bit vector of [8 x i32] containing the result.
3750static __inline__ __m256i __DEFAULT_FN_ATTRS256
3751_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3752{
3753 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3754}
3755
3756/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3757/// left by the number of bits given in the corresponding element of the
3758/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3759/// returns the result. If the shift count for any element is greater than
3760/// 31, the result for that element is zero.
3761///
3762/// \headerfile <immintrin.h>
3763///
3764/// This intrinsic corresponds to the \c VPSLLVD instruction.
3765///
3766/// \param __X
3767/// A 128-bit vector of [4 x i32] to be shifted.
3768/// \param __Y
3769/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3770/// bits).
3771/// \returns A 128-bit vector of [4 x i32] containing the result.
3772static __inline__ __m128i __DEFAULT_FN_ATTRS128
3773_mm_sllv_epi32(__m128i __X, __m128i __Y)
3774{
3775 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3776}
3777
3778/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3779/// left by the number of bits given in the corresponding element of the
3780/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3781/// returns the result. If the shift count for any element is greater than
3782/// 63, the result for that element is zero.
3783///
3784/// \headerfile <immintrin.h>
3785///
3786/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3787///
3788/// \param __X
3789/// A 256-bit vector of [4 x i64] to be shifted.
3790/// \param __Y
3791/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3792/// bits).
3793/// \returns A 256-bit vector of [4 x i64] containing the result.
3794static __inline__ __m256i __DEFAULT_FN_ATTRS256
3795_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3796{
3797 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3798}
3799
3800/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3801/// left by the number of bits given in the corresponding element of the
3802/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3803/// returns the result. If the shift count for any element is greater than
3804/// 63, the result for that element is zero.
3805///
3806/// \headerfile <immintrin.h>
3807///
3808/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3809///
3810/// \param __X
3811/// A 128-bit vector of [2 x i64] to be shifted.
3812/// \param __Y
3813/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3814/// bits).
3815/// \returns A 128-bit vector of [2 x i64] containing the result.
3816static __inline__ __m128i __DEFAULT_FN_ATTRS128
3817_mm_sllv_epi64(__m128i __X, __m128i __Y)
3818{
3819 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3820}
3821
3822/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3823/// right by the number of bits given in the corresponding element of the
3824/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3825/// returns the result. If the shift count for any element is greater than
3826/// 31, the result for that element is 0 or -1 according to the sign bit
3827/// for that element.
3828///
3829/// \headerfile <immintrin.h>
3830///
3831/// This intrinsic corresponds to the \c VPSRAVD instruction.
3832///
3833/// \param __X
3834/// A 256-bit vector of [8 x i32] to be shifted.
3835/// \param __Y
3836/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3837/// bits).
3838/// \returns A 256-bit vector of [8 x i32] containing the result.
3839static __inline__ __m256i __DEFAULT_FN_ATTRS256
3840_mm256_srav_epi32(__m256i __X, __m256i __Y)
3841{
3842 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3843}
3844
3845/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3846/// right by the number of bits given in the corresponding element of the
3847/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3848/// returns the result. If the shift count for any element is greater than
3849/// 31, the result for that element is 0 or -1 according to the sign bit
3850/// for that element.
3851///
3852/// \headerfile <immintrin.h>
3853///
3854/// This intrinsic corresponds to the \c VPSRAVD instruction.
3855///
3856/// \param __X
3857/// A 128-bit vector of [4 x i32] to be shifted.
3858/// \param __Y
3859/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3860/// bits).
3861/// \returns A 128-bit vector of [4 x i32] containing the result.
3862static __inline__ __m128i __DEFAULT_FN_ATTRS128
3863_mm_srav_epi32(__m128i __X, __m128i __Y)
3864{
3865 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3866}
3867
3868/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3869/// right by the number of bits given in the corresponding element of the
3870/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3871/// returns the result. If the shift count for any element is greater than
3872/// 31, the result for that element is zero.
3873///
3874/// \headerfile <immintrin.h>
3875///
3876/// This intrinsic corresponds to the \c VPSRLVD instruction.
3877///
3878/// \param __X
3879/// A 256-bit vector of [8 x i32] to be shifted.
3880/// \param __Y
3881/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3882/// bits).
3883/// \returns A 256-bit vector of [8 x i32] containing the result.
3884static __inline__ __m256i __DEFAULT_FN_ATTRS256
3885_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3886{
3887 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3888}
3889
3890/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3891/// right by the number of bits given in the corresponding element of the
3892/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3893/// returns the result. If the shift count for any element is greater than
3894/// 31, the result for that element is zero.
3895///
3896/// \headerfile <immintrin.h>
3897///
3898/// This intrinsic corresponds to the \c VPSRLVD instruction.
3899///
3900/// \param __X
3901/// A 128-bit vector of [4 x i32] to be shifted.
3902/// \param __Y
3903/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3904/// bits).
3905/// \returns A 128-bit vector of [4 x i32] containing the result.
3906static __inline__ __m128i __DEFAULT_FN_ATTRS128
3907_mm_srlv_epi32(__m128i __X, __m128i __Y)
3908{
3909 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3910}
3911
3912/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3913/// right by the number of bits given in the corresponding element of the
3914/// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3915/// returns the result. If the shift count for any element is greater than
3916/// 63, the result for that element is zero.
3917///
3918/// \headerfile <immintrin.h>
3919///
3920/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3921///
3922/// \param __X
3923/// A 256-bit vector of [4 x i64] to be shifted.
3924/// \param __Y
3925/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3926/// bits).
3927/// \returns A 256-bit vector of [4 x i64] containing the result.
3928static __inline__ __m256i __DEFAULT_FN_ATTRS256
3929_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3930{
3931 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3932}
3933
3934/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3935/// right by the number of bits given in the corresponding element of the
3936/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3937/// returns the result. If the shift count for any element is greater than
3938/// 63, the result for that element is zero.
3939///
3940/// \headerfile <immintrin.h>
3941///
3942/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3943///
3944/// \param __X
3945/// A 128-bit vector of [2 x i64] to be shifted.
3946/// \param __Y
3947/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3948/// bits).
3949/// \returns A 128-bit vector of [2 x i64] containing the result.
3950static __inline__ __m128i __DEFAULT_FN_ATTRS128
3951_mm_srlv_epi64(__m128i __X, __m128i __Y)
3952{
3953 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3954}
3955
3956/// Conditionally gathers two 64-bit floating-point values, either from the
3957/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3958/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3959/// of [2 x double] in \a mask determines the source for each element.
3960///
3961/// \code{.operation}
3962/// FOR element := 0 to 1
3963/// j := element*64
3964/// k := element*32
3965/// IF mask[j+63] == 0
3966/// result[j+63:j] := a[j+63:j]
3967/// ELSE
3968/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3969/// FI
3970/// ENDFOR
3971/// \endcode
3972///
3973/// \headerfile <immintrin.h>
3974///
3975/// \code
3976/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
3977/// __m128d mask, const int s);
3978/// \endcode
3979///
3980/// This intrinsic corresponds to the \c VGATHERDPD instruction.
3981///
3982/// \param a
3983/// A 128-bit vector of [2 x double] used as the source when a mask bit is
3984/// zero.
3985/// \param m
3986/// A pointer to the memory used for loading values.
3987/// \param i
3988/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3989/// the first two elements are used.
3990/// \param mask
3991/// A 128-bit vector of [2 x double] containing the mask. The most
3992/// significant bit of each element in the mask vector represents the mask
3993/// bits. If a mask bit is zero, the corresponding value from vector \a a
3994/// is gathered; otherwise the value is loaded from memory.
3995/// \param s
3996/// A literal constant scale factor for the indexes in \a i. Must be
3997/// 1, 2, 4, or 8.
3998/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The pass-through operand `a` is a [2 x double] vector, so it is cast
   through __m128d (not __m128i), matching the documented prototype and the
   other gather macros. The resulting bits are unchanged. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      /* pass-through values */ (__v2df)(__m128d)(a), \
      /* base address */ (double const *)(m), \
      /* 32-bit indexes */ (__v4si)(__m128i)(i), \
      /* per-element mask */ (__v2df)(__m128d)(mask), \
      /* index scale */ (s)))
4004
4005/// Conditionally gathers four 64-bit floating-point values, either from the
4006/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4007/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4008/// of [4 x double] in \a mask determines the source for each element.
4009///
4010/// \code{.operation}
4011/// FOR element := 0 to 3
4012/// j := element*64
4013/// k := element*32
4014/// IF mask[j+63] == 0
4015/// result[j+63:j] := a[j+63:j]
4016/// ELSE
4017/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4018/// FI
4019/// ENDFOR
4020/// \endcode
4021///
4022/// \headerfile <immintrin.h>
4023///
4024/// \code
4025/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
4026/// __m256d mask, const int s);
4027/// \endcode
4028///
4029/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4030///
4031/// \param a
4032/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4033/// zero.
4034/// \param m
4035/// A pointer to the memory used for loading values.
4036/// \param i
4037/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4038/// \param mask
4039/// A 256-bit vector of [4 x double] containing the mask. The most
4040/// significant bit of each element in the mask vector represents the mask
4041/// bits. If a mask bit is zero, the corresponding value from vector \a a
4042/// is gathered; otherwise the value is loaded from memory.
4043/// \param s
4044/// A literal constant scale factor for the indexes in \a i. Must be
4045/// 1, 2, 4, or 8.
4046/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      /* pass-through values */ (__v4df)(__m256d)(a), \
      /* base address */ (double const *)(m), \
      /* 32-bit indexes */ (__v4si)(__m128i)(i), \
      /* per-element mask */ (__v4df)(__m256d)(mask), \
      /* index scale */ (s)))
4052
4053/// Conditionally gathers two 64-bit floating-point values, either from the
4054/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4055/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4056/// of [2 x double] in \a mask determines the source for each element.
4057///
4058/// \code{.operation}
4059/// FOR element := 0 to 1
4060/// j := element*64
4061/// k := element*64
4062/// IF mask[j+63] == 0
4063/// result[j+63:j] := a[j+63:j]
4064/// ELSE
4065/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4066/// FI
4067/// ENDFOR
4068/// \endcode
4069///
4070/// \headerfile <immintrin.h>
4071///
4072/// \code
4073/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4074/// __m128d mask, const int s);
4075/// \endcode
4076///
4077/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4078///
4079/// \param a
4080/// A 128-bit vector of [2 x double] used as the source when a mask bit is
4081/// zero.
4082/// \param m
4083/// A pointer to the memory used for loading values.
4084/// \param i
4085/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4086/// \param mask
4087/// A 128-bit vector of [2 x double] containing the mask. The most
4088/// significant bit of each element in the mask vector represents the mask
4089/// bits. If a mask bit is zero, the corresponding value from vector \a a
4090/// is gathered; otherwise the value is loaded from memory.
4091/// \param s
4092/// A literal constant scale factor for the indexes in \a i. Must be
4093/// 1, 2, 4, or 8.
4094/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      /* pass-through values */ (__v2df)(__m128d)(a), \
      /* base address */ (double const *)(m), \
      /* 64-bit indexes */ (__v2di)(__m128i)(i), \
      /* per-element mask */ (__v2df)(__m128d)(mask), \
      /* index scale */ (s)))
4100
4101/// Conditionally gathers four 64-bit floating-point values, either from the
4102/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4103/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4104/// of [4 x double] in \a mask determines the source for each element.
4105///
4106/// \code{.operation}
4107/// FOR element := 0 to 3
4108/// j := element*64
4109/// k := element*64
4110/// IF mask[j+63] == 0
4111/// result[j+63:j] := a[j+63:j]
4112/// ELSE
4113/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4114/// FI
4115/// ENDFOR
4116/// \endcode
4117///
4118/// \headerfile <immintrin.h>
4119///
4120/// \code
4121/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4122/// __m256d mask, const int s);
4123/// \endcode
4124///
4125/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4126///
4127/// \param a
4128/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4129/// zero.
4130/// \param m
4131/// A pointer to the memory used for loading values.
4132/// \param i
4133/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4134/// \param mask
4135/// A 256-bit vector of [4 x double] containing the mask. The most
4136/// significant bit of each element in the mask vector represents the mask
4137/// bits. If a mask bit is zero, the corresponding value from vector \a a
4138/// is gathered; otherwise the value is loaded from memory.
4139/// \param s
4140/// A literal constant scale factor for the indexes in \a i. Must be
4141/// 1, 2, 4, or 8.
4142/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      /* pass-through values */ (__v4df)(__m256d)(a), \
      /* base address */ (double const *)(m), \
      /* 64-bit indexes */ (__v4di)(__m256i)(i), \
      /* per-element mask */ (__v4df)(__m256d)(mask), \
      /* index scale */ (s)))
4148
4149/// Conditionally gathers four 32-bit floating-point values, either from the
4150/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4151/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4152/// of [4 x float] in \a mask determines the source for each element.
4153///
4154/// \code{.operation}
4155/// FOR element := 0 to 3
4156/// j := element*32
4157/// k := element*32
4158/// IF mask[j+31] == 0
4159/// result[j+31:j] := a[j+31:j]
4160/// ELSE
4161/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4162/// FI
4163/// ENDFOR
4164/// \endcode
4165///
4166/// \headerfile <immintrin.h>
4167///
4168/// \code
4169/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4170/// __m128 mask, const int s);
4171/// \endcode
4172///
4173/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4174///
4175/// \param a
4176/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4177/// zero.
4178/// \param m
4179/// A pointer to the memory used for loading values.
4180/// \param i
4181/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4182/// \param mask
4183/// A 128-bit vector of [4 x float] containing the mask. The most
4184/// significant bit of each element in the mask vector represents the mask
4185/// bits. If a mask bit is zero, the corresponding value from vector \a a
4186/// is gathered; otherwise the value is loaded from memory.
4187/// \param s
4188/// A literal constant scale factor for the indexes in \a i. Must be
4189/// 1, 2, 4, or 8.
4190/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      /* pass-through values */ (__v4sf)(__m128)(a), \
      /* base address */ (float const *)(m), \
      /* 32-bit indexes */ (__v4si)(__m128i)(i), \
      /* per-element mask */ (__v4sf)(__m128)(mask), \
      /* index scale */ (s)))
4196
4197/// Conditionally gathers eight 32-bit floating-point values, either from the
4198/// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4199/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4200/// of [8 x float] in \a mask determines the source for each element.
4201///
4202/// \code{.operation}
4203/// FOR element := 0 to 7
4204/// j := element*32
4205/// k := element*32
4206/// IF mask[j+31] == 0
4207/// result[j+31:j] := a[j+31:j]
4208/// ELSE
4209/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4210/// FI
4211/// ENDFOR
4212/// \endcode
4213///
4214/// \headerfile <immintrin.h>
4215///
4216/// \code
4217/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4218/// __m256 mask, const int s);
4219/// \endcode
4220///
4221/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4222///
4223/// \param a
4224/// A 256-bit vector of [8 x float] used as the source when a mask bit is
4225/// zero.
4226/// \param m
4227/// A pointer to the memory used for loading values.
4228/// \param i
4229/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4230/// \param mask
4231/// A 256-bit vector of [8 x float] containing the mask. The most
4232/// significant bit of each element in the mask vector represents the mask
4233/// bits. If a mask bit is zero, the corresponding value from vector \a a
4234/// is gathered; otherwise the value is loaded from memory.
4235/// \param s
4236/// A literal constant scale factor for the indexes in \a i. Must be
4237/// 1, 2, 4, or 8.
4238/// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherd_ps256. Every macro
 * argument appears exactly once in the expansion and is cast to the operand
 * type shown below ([8 x float] source and mask, [8 x i32] indexes); s is
 * forwarded unchanged as the last argument. */
4239#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
4240 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
4241 (float const *)(m), \
4242 (__v8si)(__m256i)(i), \
4243 (__v8sf)(__m256)(mask), (s)))
4244
4245/// Conditionally gathers two 32-bit floating-point values, either from the
4246/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4247/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4248/// of [4 x float] in \a mask determines the source for the lower two
4249/// elements. The upper two elements of the result are zeroed.
4250///
4251/// \code{.operation}
4252/// FOR element := 0 to 1
4253/// j := element*32
4254/// k := element*64
4255/// IF mask[j+31] == 0
4256/// result[j+31:j] := a[j+31:j]
4257/// ELSE
4258/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4259/// FI
4260/// ENDFOR
4261/// result[127:64] := 0
4262/// \endcode
4263///
4264/// \headerfile <immintrin.h>
4265///
4266/// \code
4267/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4268/// __m128 mask, const int s);
4269/// \endcode
4270///
4271/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4272///
4273/// \param a
4274/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4275/// zero. Only the first two elements are used.
4276/// \param m
4277/// A pointer to the memory used for loading values.
4278/// \param i
4279/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4280/// \param mask
4281/// A 128-bit vector of [4 x float] containing the mask. The most
4282/// significant bit of each element in the mask vector represents the mask
4283/// bits. If a mask bit is zero, the corresponding value from vector \a a
4284/// is gathered; otherwise the value is loaded from memory. Only the first
4285/// two elements are used.
4286/// \param s
4287/// A literal constant scale factor for the indexes in \a i. Must be
4288/// 1, 2, 4, or 8.
4289/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherq_ps. The indexes are
 * passed as [2 x i64] while the source, mask and result remain full 128-bit
 * [4 x float] vectors. Every macro argument appears exactly once in the
 * expansion; s is forwarded unchanged as the last argument. */
4290#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
4291 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
4292 (float const *)(m), \
4293 (__v2di)(__m128i)(i), \
4294 (__v4sf)(__m128)(mask), (s)))
4295
4296/// Conditionally gathers four 32-bit floating-point values, either from the
4297/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4298/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4299/// of [4 x float] in \a mask determines the source for each element.
4300///
4301/// \code{.operation}
4302/// FOR element := 0 to 3
4303/// j := element*32
4304/// k := element*64
4305/// IF mask[j+31] == 0
4306/// result[j+31:j] := a[j+31:j]
4307/// ELSE
4308/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4309/// FI
4310/// ENDFOR
4311/// \endcode
4312///
4313/// \headerfile <immintrin.h>
4314///
4315/// \code
4316/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4317/// __m128 mask, const int s);
4318/// \endcode
4319///
4320/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4321///
4322/// \param a
4323/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4324/// zero.
4325/// \param m
4326/// A pointer to the memory used for loading values.
4327/// \param i
4328/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4329/// \param mask
4330/// A 128-bit vector of [4 x float] containing the mask. The most
4331/// significant bit of each element in the mask vector represents the mask
4332/// bits. If a mask bit is zero, the corresponding value from vector \a a
4333/// is gathered; otherwise the value is loaded from memory.
4334/// \param s
4335/// A literal constant scale factor for the indexes in \a i. Must be
4336/// 1, 2, 4, or 8.
4337/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherq_ps256. Only the index
 * operand is 256 bits wide ([4 x i64]); the source, mask and result are
 * 128-bit [4 x float] vectors. Every macro argument appears exactly once in
 * the expansion; s is forwarded unchanged as the last argument. */
4338#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
4339 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
4340 (float const *)(m), \
4341 (__v4di)(__m256i)(i), \
4342 (__v4sf)(__m128)(mask), (s)))
4343
4344/// Conditionally gathers four 32-bit integer values, either from the
4345/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4346/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4347/// of [4 x i32] in \a mask determines the source for each element.
4348///
4349/// \code{.operation}
4350/// FOR element := 0 to 3
4351/// j := element*32
4352/// k := element*32
4353/// IF mask[j+31] == 0
4354/// result[j+31:j] := a[j+31:j]
4355/// ELSE
4356/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4357/// FI
4358/// ENDFOR
4359/// \endcode
4360///
4361/// \headerfile <immintrin.h>
4362///
4363/// \code
4364/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4365/// __m128i mask, const int s);
4366/// \endcode
4367///
4368/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4369///
4370/// \param a
4371/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4372/// zero.
4373/// \param m
4374/// A pointer to the memory used for loading values.
4375/// \param i
4376/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4377/// \param mask
4378/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4379/// bit of each element in the mask vector represents the mask bits. If a
4380/// mask bit is zero, the corresponding value from vector \a a is gathered;
4381/// otherwise the value is loaded from memory.
4382/// \param s
4383/// A literal constant scale factor for the indexes in \a i. Must be
4384/// 1, 2, 4, or 8.
4385/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherd_d. Source, indexes and
 * mask are all passed as [4 x i32]. Every macro argument appears exactly once
 * in the expansion; s is forwarded unchanged as the last argument. */
4386#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
4387 ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
4388 (int const *)(m), \
4389 (__v4si)(__m128i)(i), \
4390 (__v4si)(__m128i)(mask), (s)))
4391
4392/// Conditionally gathers eight 32-bit integer values, either from the
4393/// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4394/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4395/// of [8 x i32] in \a mask determines the source for each element.
4396///
4397/// \code{.operation}
4398/// FOR element := 0 to 7
4399/// j := element*32
4400/// k := element*32
4401/// IF mask[j+31] == 0
4402/// result[j+31:j] := a[j+31:j]
4403/// ELSE
4404/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4405/// FI
4406/// ENDFOR
4407/// \endcode
4408///
4409/// \headerfile <immintrin.h>
4410///
4411/// \code
4412/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4413/// __m256i mask, const int s);
4414/// \endcode
4415///
4416/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4417///
4418/// \param a
4419/// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4420/// zero.
4421/// \param m
4422/// A pointer to the memory used for loading values.
4423/// \param i
4424/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4425/// \param mask
4426/// A 256-bit vector of [8 x i32] containing the mask. The most significant
4427/// bit of each element in the mask vector represents the mask bits. If a
4428/// mask bit is zero, the corresponding value from vector \a a is gathered;
4429/// otherwise the value is loaded from memory.
4430/// \param s
4431/// A literal constant scale factor for the indexes in \a i. Must be
4432/// 1, 2, 4, or 8.
4433/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherd_d256. Source, indexes
 * and mask are all passed as [8 x i32]. Every macro argument appears exactly
 * once in the expansion; s is forwarded unchanged as the last argument. */
4434#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
4435 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
4436 (int const *)(m), \
4437 (__v8si)(__m256i)(i), \
4438 (__v8si)(__m256i)(mask), (s)))
4439
4440/// Conditionally gathers two 32-bit integer values, either from the
4441/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4442/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4443/// of [4 x i32] in \a mask determines the source for the lower two
4444/// elements. The upper two elements of the result are zeroed.
4445///
4446/// \code{.operation}
4447/// FOR element := 0 to 1
4448/// j := element*32
4449/// k := element*64
4450/// IF mask[j+31] == 0
4451/// result[j+31:j] := a[j+31:j]
4452/// ELSE
4453/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4454/// FI
4455/// ENDFOR
4456/// result[127:64] := 0
4457/// \endcode
4458///
4459/// \headerfile <immintrin.h>
4460///
4461/// \code
4462/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4463/// __m128i mask, const int s);
4464/// \endcode
4465///
4466/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4467///
4468/// \param a
4469/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4470/// zero. Only the first two elements are used.
4471/// \param m
4472/// A pointer to the memory used for loading values.
4473/// \param i
4474/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4475/// \param mask
4476/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4477/// bit of each element in the mask vector represents the mask bits. If a
4478/// mask bit is zero, the corresponding value from vector \a a is gathered;
4479/// otherwise the value is loaded from memory. Only the first two elements
4480/// are used.
4481/// \param s
4482/// A literal constant scale factor for the indexes in \a i. Must be
4483/// 1, 2, 4, or 8.
4484/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherq_d. The indexes are
 * passed as [2 x i64] while the source, mask and result remain full 128-bit
 * [4 x i32] vectors. Every macro argument appears exactly once in the
 * expansion; s is forwarded unchanged as the last argument. */
4485#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
4486 ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
4487 (int const *)(m), \
4488 (__v2di)(__m128i)(i), \
4489 (__v4si)(__m128i)(mask), (s)))
4490
4491/// Conditionally gathers four 32-bit integer values, either from the
4492/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4493/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4494/// of [4 x i32] in \a mask determines the source for each element.
4495///
4496/// \code{.operation}
4497/// FOR element := 0 to 3
4498/// j := element*32
4499/// k := element*64
4500/// IF mask[j+31] == 0
4501/// result[j+31:j] := a[j+31:j]
4502/// ELSE
4503/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4504/// FI
4505/// ENDFOR
4506/// \endcode
4507///
4508/// \headerfile <immintrin.h>
4509///
4510/// \code
4511/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4512/// __m128i mask, const int s);
4513/// \endcode
4514///
4515/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4516///
4517/// \param a
4518/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4519/// zero.
4520/// \param m
4521/// A pointer to the memory used for loading values.
4522/// \param i
4523/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4524/// \param mask
4525/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4526/// bit of each element in the mask vector represents the mask bits. If a
4527/// mask bit is zero, the corresponding value from vector \a a is gathered;
4528/// otherwise the value is loaded from memory.
4529/// \param s
4530/// A literal constant scale factor for the indexes in \a i. Must be
4531/// 1, 2, 4, or 8.
4532/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherq_d256. Only the index
 * operand is 256 bits wide ([4 x i64]); the source, mask and result are
 * 128-bit [4 x i32] vectors. Every macro argument appears exactly once in the
 * expansion; s is forwarded unchanged as the last argument. */
4533#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
4534 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
4535 (int const *)(m), \
4536 (__v4di)(__m256i)(i), \
4537 (__v4si)(__m128i)(mask), (s)))
4538
4539/// Conditionally gathers two 64-bit integer values, either from the
4540/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4541/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4542/// of [2 x i64] in \a mask determines the source for each element.
4543///
4544/// \code{.operation}
4545/// FOR element := 0 to 1
4546/// j := element*64
4547/// k := element*32
4548/// IF mask[j+63] == 0
4549/// result[j+63:j] := a[j+63:j]
4550/// ELSE
4551/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4552/// FI
4553/// ENDFOR
4554/// \endcode
4555///
4556/// \headerfile <immintrin.h>
4557///
4558/// \code
4559/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4560/// __m128i mask, const int s);
4561/// \endcode
4562///
4563/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4564///
4565/// \param a
4566/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4567/// zero.
4568/// \param m
4569/// A pointer to the memory used for loading values.
4570/// \param i
4571/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4572/// the first two elements are used.
4573/// \param mask
4574/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4575/// bit of each element in the mask vector represents the mask bits. If a
4576/// mask bit is zero, the corresponding value from vector \a a is gathered;
4577/// otherwise the value is loaded from memory.
4578/// \param s
4579/// A literal constant scale factor for the indexes in \a i. Must be
4580/// 1, 2, 4, or 8.
4581/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherd_q. The indexes are
 * passed as a full [4 x i32] vector while the source, mask and result are
 * [2 x i64]. Every macro argument appears exactly once in the expansion; s is
 * forwarded unchanged as the last argument. */
4582#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
4583 ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
4584 (long long const *)(m), \
4585 (__v4si)(__m128i)(i), \
4586 (__v2di)(__m128i)(mask), (s)))
4587
4588/// Conditionally gathers four 64-bit integer values, either from the
4589/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4590/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4591/// of [4 x i64] in \a mask determines the source for each element.
4592///
4593/// \code{.operation}
4594/// FOR element := 0 to 3
4595/// j := element*64
4596/// k := element*32
4597/// IF mask[j+63] == 0
4598/// result[j+63:j] := a[j+63:j]
4599/// ELSE
4600/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4601/// FI
4602/// ENDFOR
4603/// \endcode
4604///
4605/// \headerfile <immintrin.h>
4606///
4607/// \code
4608/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4609/// __m128i i, __m256i mask, const int s);
4610/// \endcode
4611///
4612/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4613///
4614/// \param a
4615/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4616/// zero.
4617/// \param m
4618/// A pointer to the memory used for loading values.
4619/// \param i
4620/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4621/// \param mask
4622/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4623/// bit of each element in the mask vector represents the mask bits. If a
4624/// mask bit is zero, the corresponding value from vector \a a is gathered;
4625/// otherwise the value is loaded from memory.
4626/// \param s
4627/// A literal constant scale factor for the indexes in \a i. Must be
4628/// 1, 2, 4, or 8.
4629/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherd_q256. The index operand
 * is a 128-bit [4 x i32] vector; the source, mask and result are 256-bit
 * [4 x i64] vectors. Every macro argument appears exactly once in the
 * expansion; s is forwarded unchanged as the last argument. */
4630#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
4631 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
4632 (long long const *)(m), \
4633 (__v4si)(__m128i)(i), \
4634 (__v4di)(__m256i)(mask), (s)))
4635
4636/// Conditionally gathers two 64-bit integer values, either from the
4637/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4638/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4639/// of [2 x i64] in \a mask determines the source for each element.
4640///
4641/// \code{.operation}
4642/// FOR element := 0 to 1
4643/// j := element*64
4644/// k := element*64
4645/// IF mask[j+63] == 0
4646/// result[j+63:j] := a[j+63:j]
4647/// ELSE
4648/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4649/// FI
4650/// ENDFOR
4651/// \endcode
4652///
4653/// \headerfile <immintrin.h>
4654///
4655/// \code
4656/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4657/// __m128i mask, const int s);
4658/// \endcode
4659///
4660/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4661///
4662/// \param a
4663/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4664/// zero.
4665/// \param m
4666/// A pointer to the memory used for loading values.
4667/// \param i
4668/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4669/// \param mask
4670/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4671/// bit of each element in the mask vector represents the mask bits. If a
4672/// mask bit is zero, the corresponding value from vector \a a is gathered;
4673/// otherwise the value is loaded from memory.
4674/// \param s
4675/// A literal constant scale factor for the indexes in \a i. Must be
4676/// 1, 2, 4, or 8.
4677/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherq_q. Source, indexes and
 * mask are all passed as [2 x i64]. Every macro argument appears exactly once
 * in the expansion; s is forwarded unchanged as the last argument. */
4678#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
4679 ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
4680 (long long const *)(m), \
4681 (__v2di)(__m128i)(i), \
4682 (__v2di)(__m128i)(mask), (s)))
4683
4684/// Conditionally gathers four 64-bit integer values, either from the
4685/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4686/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4687/// of [4 x i64] in \a mask determines the source for each element.
4688///
4689/// \code{.operation}
4690/// FOR element := 0 to 3
4691/// j := element*64
4692/// k := element*64
4693/// IF mask[j+63] == 0
4694/// result[j+63:j] := a[j+63:j]
4695/// ELSE
4696/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4697/// FI
4698/// ENDFOR
4699/// \endcode
4700///
4701/// \headerfile <immintrin.h>
4702///
4703/// \code
4704/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4705/// __m256i i, __m256i mask, const int s);
4706/// \endcode
4707///
4708/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4709///
4710/// \param a
4711/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4712/// zero.
4713/// \param m
4714/// A pointer to the memory used for loading values.
4715/// \param i
4716/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4717/// \param mask
4718/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4719/// bit of each element in the mask vector represents the mask bits. If a
4720/// mask bit is zero, the corresponding value from vector \a a is gathered;
4721/// otherwise the value is loaded from memory.
4722/// \param s
4723/// A literal constant scale factor for the indexes in \a i. Must be
4724/// 1, 2, 4, or 8.
4725/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Expands to a single call of __builtin_ia32_gatherq_q256. Source, indexes
 * and mask are all passed as [4 x i64]. Every macro argument appears exactly
 * once in the expansion; s is forwarded unchanged as the last argument. */
4726#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
4727 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
4728 (long long const *)(m), \
4729 (__v4di)(__m256i)(i), \
4730 (__v4di)(__m256i)(mask), (s)))
4731
4732/// Gathers two 64-bit floating-point values from memory \a m using scaled
4733/// indexes from the 128-bit vector of [4 x i32] in \a i.
4734///
4735/// \code{.operation}
4736/// FOR element := 0 to 1
4737/// j := element*64
4738/// k := element*32
4739/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4740/// ENDFOR
4741/// \endcode
4742///
4743/// \headerfile <immintrin.h>
4744///
4745/// \code
4746/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4747/// \endcode
4748///
4749/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4750///
4751/// \param m
4752/// A pointer to the memory used for loading values.
4753/// \param i
4754/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4755/// the first two elements are used.
4756/// \param s
4757/// A literal constant scale factor for the indexes in \a i. Must be
4758/// 1, 2, 4, or 8.
4759/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather: the source
 * operand is _mm_undefined_pd(), and the mask is a zero vector compared for
 * equality with itself, which sets every bit of every element. With every
 * mask bit set, each result element is loaded from memory. */
4760#define _mm_i32gather_pd(m, i, s) \
4761 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
4762 (double const *)(m), \
4763 (__v4si)(__m128i)(i), \
4764 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4765 _mm_setzero_pd()), \
4766 (s)))
4767
4768/// Gathers four 64-bit floating-point values from memory \a m using scaled
4769/// indexes from the 128-bit vector of [4 x i32] in \a i.
4770///
4771/// \code{.operation}
4772/// FOR element := 0 to 3
4773/// j := element*64
4774/// k := element*32
4775/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4776/// ENDFOR
4777/// \endcode
4778///
4779/// \headerfile <immintrin.h>
4780///
4781/// \code
4782/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4783/// \endcode
4784///
4785/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4786///
4787/// \param m
4788/// A pointer to the memory used for loading values.
4789/// \param i
4790/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4791/// \param s
4792/// A literal constant scale factor for the indexes in \a i. Must be
4793/// 1, 2, 4, or 8.
4794/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather: the source
 * operand is _mm256_undefined_pd(), and the mask is a zero vector compared
 * with itself using _CMP_EQ_OQ, which sets every bit of every element. With
 * every mask bit set, each result element is loaded from memory. */
4795#define _mm256_i32gather_pd(m, i, s) \
4796 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
4797 (double const *)(m), \
4798 (__v4si)(__m128i)(i), \
4799 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4800 _mm256_setzero_pd(), \
4801 _CMP_EQ_OQ), \
4802 (s)))
4803
4804/// Gathers two 64-bit floating-point values from memory \a m using scaled
4805/// indexes from the 128-bit vector of [2 x i64] in \a i.
4806///
4807/// \code{.operation}
4808/// FOR element := 0 to 1
4809/// j := element*64
4810/// k := element*64
4811/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4812/// ENDFOR
4813/// \endcode
4814///
4815/// \headerfile <immintrin.h>
4816///
4817/// \code
4818/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4819/// \endcode
4820///
4821/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4822///
4823/// \param m
4824/// A pointer to the memory used for loading values.
4825/// \param i
4826/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4827/// \param s
4828/// A literal constant scale factor for the indexes in \a i. Must be
4829/// 1, 2, 4, or 8.
4830/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather: the source
 * operand is _mm_undefined_pd(), and the mask is a zero vector compared for
 * equality with itself, which sets every bit of every element. With every
 * mask bit set, each result element is loaded from memory. */
4831#define _mm_i64gather_pd(m, i, s) \
4832 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
4833 (double const *)(m), \
4834 (__v2di)(__m128i)(i), \
4835 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4836 _mm_setzero_pd()), \
4837 (s)))
4838
4839/// Gathers four 64-bit floating-point values from memory \a m using scaled
4840/// indexes from the 256-bit vector of [4 x i64] in \a i.
4841///
4842/// \code{.operation}
4843/// FOR element := 0 to 3
4844/// j := element*64
4845/// k := element*64
4846/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4847/// ENDFOR
4848/// \endcode
4849///
4850/// \headerfile <immintrin.h>
4851///
4852/// \code
4853/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4854/// \endcode
4855///
4856/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4857///
4858/// \param m
4859/// A pointer to the memory used for loading values.
4860/// \param i
4861/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4862/// \param s
4863/// A literal constant scale factor for the indexes in \a i. Must be
4864/// 1, 2, 4, or 8.
4865/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather: the source
 * operand is _mm256_undefined_pd(), and the mask is a zero vector compared
 * with itself using _CMP_EQ_OQ, which sets every bit of every element. With
 * every mask bit set, each result element is loaded from memory. */
4866#define _mm256_i64gather_pd(m, i, s) \
4867 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4868 (double const *)(m), \
4869 (__v4di)(__m256i)(i), \
4870 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4871 _mm256_setzero_pd(), \
4872 _CMP_EQ_OQ), \
4873 (s)))
4874
4875/// Gathers four 32-bit floating-point values from memory \a m using scaled
4876/// indexes from the 128-bit vector of [4 x i32] in \a i.
4877///
4878/// \code{.operation}
4879/// FOR element := 0 to 3
4880/// j := element*32
4881/// k := element*32
4882/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4883/// ENDFOR
4884/// \endcode
4885///
4886/// \headerfile <immintrin.h>
4887///
4888/// \code
4889/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4890/// \endcode
4891///
4892/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4893///
4894/// \param m
4895/// A pointer to the memory used for loading values.
4896/// \param i
4897/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4898/// \param s
4899/// A literal constant scale factor for the indexes in \a i. Must be
4900/// 1, 2, 4, or 8.
4901/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather: the source
 * operand is _mm_undefined_ps(), and the mask is a zero vector compared for
 * equality with itself, which sets every bit of every element. With every
 * mask bit set, each result element is loaded from memory. */
4902#define _mm_i32gather_ps(m, i, s) \
4903 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4904 (float const *)(m), \
4905 (__v4si)(__m128i)(i), \
4906 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4907 _mm_setzero_ps()), \
4908 (s)))
4909
4910/// Gathers eight 32-bit floating-point values from memory \a m using scaled
4911/// indexes from the 256-bit vector of [8 x i32] in \a i.
4912///
4913/// \code{.operation}
4914/// FOR element := 0 to 7
4915/// j := element*32
4916/// k := element*32
4917/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4918/// ENDFOR
4919/// \endcode
4920///
4921/// \headerfile <immintrin.h>
4922///
4923/// \code
4924/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4925/// \endcode
4926///
4927/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4928///
4929/// \param m
4930/// A pointer to the memory used for loading values.
4931/// \param i
4932/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4933/// \param s
4934/// A literal constant scale factor for the indexes in \a i. Must be
4935/// 1, 2, 4, or 8.
4936/// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather: the source
 * operand is _mm256_undefined_ps(), and the mask is a zero vector compared
 * with itself using _CMP_EQ_OQ, which sets every bit of every element. With
 * every mask bit set, each result element is loaded from memory. */
4937#define _mm256_i32gather_ps(m, i, s) \
4938 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4939 (float const *)(m), \
4940 (__v8si)(__m256i)(i), \
4941 (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4942 _mm256_setzero_ps(), \
4943 _CMP_EQ_OQ), \
4944 (s)))
4945
4946/// Gathers two 32-bit floating-point values from memory \a m using scaled
4947/// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4948/// elements of the result are zeroed.
4949///
4950/// \code{.operation}
4951/// FOR element := 0 to 1
4952/// j := element*32
4953/// k := element*64
4954/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4955/// ENDFOR
4956/// result[127:64] := 0
4957/// \endcode
4958///
4959/// \headerfile <immintrin.h>
4960///
4961/// \code
4962/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4963/// \endcode
4964///
4965/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4966///
4967/// \param m
4968/// A pointer to the memory used for loading values.
4969/// \param i
4970/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4971/// \param s
4972/// A literal constant scale factor for the indexes in \a i. Must be
4973/// 1, 2, 4, or 8.
4974/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather: the source
 * operand is _mm_undefined_ps(), and the mask is a zero vector compared for
 * equality with itself, which sets every bit of all four mask elements. The
 * indexes are passed as [2 x i64]. */
4975#define _mm_i64gather_ps(m, i, s) \
4976 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4977 (float const *)(m), \
4978 (__v2di)(__m128i)(i), \
4979 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4980 _mm_setzero_ps()), \
4981 (s)))
4982
4983/// Gathers four 32-bit floating-point values from memory \a m using scaled
4984/// indexes from the 256-bit vector of [4 x i64] in \a i.
4985///
4986/// \code{.operation}
4987/// FOR element := 0 to 3
4988/// j := element*32
4989/// k := element*64
4990/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4991/// ENDFOR
4992/// \endcode
4993///
4994/// \headerfile <immintrin.h>
4995///
4996/// \code
4997/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4998/// \endcode
4999///
5000/// This intrinsic corresponds to the \c VGATHERQPS instruction.
5001///
5002/// \param m
5003/// A pointer to the memory used for loading values.
5004/// \param i
5005/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5006/// \param s
5007/// A literal constant scale factor for the indexes in \a i. Must be
5008/// 1, 2, 4, or 8.
5009/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather. Although the
 * index operand is 256 bits wide ([4 x i64]), the result holds four floats in
 * 128 bits, so the source (_mm_undefined_ps()) and the mask are built with
 * the 128-bit helpers. The mask is a zero vector compared for equality with
 * itself, which sets every bit of every element, so each result element is
 * loaded from memory. */
5010#define _mm256_i64gather_ps(m, i, s) \
5011 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
5012 (float const *)(m), \
5013 (__v4di)(__m256i)(i), \
5014 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
5015 _mm_setzero_ps()), \
5016 (s)))
5017
5018/// Gathers four 32-bit integer values from memory \a m using scaled
5019/// indexes from the 128-bit vector of [4 x i32] in \a i.
5020///
5021/// \code{.operation}
5022/// FOR element := 0 to 3
5023/// j := element*32
5024/// k := element*32
5025/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5026/// ENDFOR
5027/// \endcode
5028///
5029/// \headerfile <immintrin.h>
5030///
5031/// \code
5032/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5033/// \endcode
5034///
5035/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5036///
5037/// \param m
5038/// A pointer to the memory used for loading values.
5039/// \param i
5040/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5041/// \param s
5042/// A literal constant scale factor for the indexes in \a i. Must be
5043/// 1, 2, 4, or 8.
5044/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Unmasked form built on the same builtin as the masked gather: the source
 * operand is _mm_undefined_si128(), and the mask is _mm_set1_epi32(-1), i.e.
 * every 32-bit element has all bits set (including the most significant
 * bit), so each result element is loaded from memory. */
5045#define _mm_i32gather_epi32(m, i, s) \
5046 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
5047 (int const *)(m), (__v4si)(__m128i)(i), \
5048 (__v4si)_mm_set1_epi32(-1), (s)))
5049
5050/// Gathers eight 32-bit integer values from memory \a m using scaled
5051/// indexes from the 256-bit vector of [8 x i32] in \a i.
5052///
5053/// \code{.operation}
5054/// FOR element := 0 to 7
5055/// j := element*32
5056/// k := element*32
5057/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5058/// ENDFOR
5059/// \endcode
5060///
5061/// \headerfile <immintrin.h>
5062///
5063/// \code
5064/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5065/// \endcode
5066///
5067/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5068///
5069/// \param m
5070/// A pointer to the memory used for loading values.
5071/// \param i
5072/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5073/// \param s
5074/// A literal constant scale factor for the indexes in \a i. Must be
5075/// 1, 2, 4, or 8.
5076/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5077#define _mm256_i32gather_epi32(m, i, s) \
5078 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5079 (int const *)(m), (__v8si)(__m256i)(i), \
5080 (__v8si)_mm256_set1_epi32(-1), (s)))
5081
5082/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5083/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5084/// of the result are zeroed.
5085///
5086/// \code{.operation}
5087/// FOR element := 0 to 1
5088/// j := element*32
5089/// k := element*64
5090/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5091/// ENDFOR
5092/// result[127:64] := 0
5093/// \endcode
5094///
5095/// \headerfile <immintrin.h>
5096///
5097/// \code
5098/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5099/// \endcode
5100///
5101/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5102///
5103/// \param m
5104/// A pointer to the memory used for loading values.
5105/// \param i
5106/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5107/// \param s
5108/// A literal constant scale factor for the indexes in \a i. Must be
5109/// 1, 2, 4, or 8.
5110/// \returns A 128-bit vector of [4 x i32]: two gathered values, then zeros.
5111#define _mm_i64gather_epi32(m, i, s) \
5112 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5113 (int const *)(m), (__v2di)(__m128i)(i), \
5114 (__v4si)_mm_set1_epi32(-1), (s)))
5115
5116/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5117/// from the 256-bit vector of [4 x i64] in \a i; the result is 128 bits wide.
5118///
5119/// \code{.operation}
5120/// FOR element := 0 to 3
5121/// j := element*32
5122/// k := element*64
5123/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5124/// ENDFOR
5125/// \endcode
5126///
5127/// \headerfile <immintrin.h>
5128///
5129/// \code
5130/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5131/// \endcode
5132///
5133/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5134///
5135/// \param m
5136/// A pointer to the memory used for loading values.
5137/// \param i
5138/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5139/// \param s
5140/// A literal constant scale factor for the indexes in \a i. Must be
5141/// 1, 2, 4, or 8.
5142/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5143#define _mm256_i64gather_epi32(m, i, s) \
5144 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5145 (int const *)(m), (__v4di)(__m256i)(i), \
5146 (__v4si)_mm_set1_epi32(-1), (s)))
5147
5148/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5149/// from the lower two elements of the 128-bit vector of [4 x i32] in \a i.
5150///
5151/// \code{.operation}
5152/// FOR element := 0 to 1
5153/// j := element*64
5154/// k := element*32
5155/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5156/// ENDFOR
5157/// \endcode
5158///
5159/// \headerfile <immintrin.h>
5160///
5161/// \code
5162/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5163/// \endcode
5164///
5165/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5166///
5167/// \param m
5168/// A pointer to the memory used for loading values.
5169/// \param i
5170/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5171/// the first two elements are used.
5172/// \param s
5173/// A literal constant scale factor for the indexes in \a i. Must be
5174/// 1, 2, 4, or 8.
5175/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5176#define _mm_i32gather_epi64(m, i, s) \
5177 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5178 (long long const *)(m), \
5179 (__v4si)(__m128i)(i), \
5180 (__v2di)_mm_set1_epi64x(-1), (s)))
5181
5182/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5183/// from all four elements of the 128-bit vector of [4 x i32] in \a i.
5184///
5185/// \code{.operation}
5186/// FOR element := 0 to 3
5187/// j := element*64
5188/// k := element*32
5189/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5190/// ENDFOR
5191/// \endcode
5192///
5193/// \headerfile <immintrin.h>
5194///
5195/// \code
5196/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5197/// \endcode
5198///
5199/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5200///
5201/// \param m
5202/// A pointer to the memory used for loading values.
5203/// \param i
5204/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5205/// \param s
5206/// A literal constant scale factor for the indexes in \a i. Must be
5207/// 1, 2, 4, or 8.
5208/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5209#define _mm256_i32gather_epi64(m, i, s) \
5210 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5211 (long long const *)(m), \
5212 (__v4si)(__m128i)(i), \
5213 (__v4di)_mm256_set1_epi64x(-1), (s)))
5214
5215/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5216/// from the 128-bit vector of [2 x i64] in \a i.
5217///
5218/// \code{.operation}
5219/// FOR element := 0 to 1
5220/// j := element*64
5221/// k := element*64
5222/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5223/// ENDFOR
5224/// \endcode
5225///
5226/// \headerfile <immintrin.h>
5227///
5228/// \code
5229/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5230/// \endcode
5231///
5232/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5233///
5234/// \param m
5235/// The base address: each load reads from \a m plus an index times \a s.
5236/// \param i
5237/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5238/// \param s
5239/// A literal constant scale factor for the indexes in \a i. Must be
5240/// 1, 2, 4, or 8.
5241/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5242#define _mm_i64gather_epi64(m, i, s) \
5243 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5244 (long long const *)(m), \
5245 (__v2di)(__m128i)(i), \
5246 (__v2di)_mm_set1_epi64x(-1), (s)))
5247
5248/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5249/// from the 256-bit vector of [4 x i64] in \a i.
5250///
5251/// \code{.operation}
5252/// FOR element := 0 to 3
5253/// j := element*64
5254/// k := element*64
5255/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5256/// ENDFOR
5257/// \endcode
5258///
5259/// \headerfile <immintrin.h>
5260///
5261/// \code
5262/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5263/// \endcode
5264///
5265/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5266///
5267/// \param m
5268/// The base address: each load reads from \a m plus an index times \a s.
5269/// \param i
5270/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5271/// \param s
5272/// A literal constant scale factor for the indexes in \a i. Must be
5273/// 1, 2, 4, or 8.
5274/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5275#define _mm256_i64gather_epi64(m, i, s) \
5276 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5277 (long long const *)(m), \
5278 (__v4di)(__m256i)(i), \
5279 (__v4di)_mm256_set1_epi64x(-1), (s)))
5280
5281#undef __DEFAULT_FN_ATTRS256 /* attribute shorthand local to this header */
5282#undef __DEFAULT_FN_ATTRS128 /* attribute shorthand local to this header */
5283
5284#endif /* __AVX2INTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
Definition: avx2intrin.h:2434
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits,...
Definition: avx2intrin.h:2453
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8(__m256i __a)
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each va...
Definition: avx2intrin.h:99
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
Definition: avx2intrin.h:2394
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi32(__m128i __V)
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
Definition: avx2intrin.h:1543
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and re...
Definition: avx2intrin.h:848
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memo...
Definition: avx2intrin.h:3669
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_and_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
Definition: avx2intrin.h:455
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
Definition: avx2intrin.h:1441
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(const void *__V)
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vec...
Definition: avx2intrin.h:2986
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
Definition: avx2intrin.h:1176
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given...
Definition: avx2intrin.h:2190
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
Definition: avx2intrin.h:2951
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result'...
Definition: avx2intrin.h:3259
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given...
Definition: avx2intrin.h:2230
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16(__m256i __a, __m256i __b)
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a a...
Definition: avx2intrin.h:1290
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi32(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation,...
Definition: avx2intrin.h:196
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M)
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the ...
Definition: avx2intrin.h:3545
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi32(__m128i __V)
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit va...
Definition: avx2intrin.h:1386
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
Definition: avx2intrin.h:1358
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to mem...
Definition: avx2intrin.h:3639
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns t...
Definition: avx2intrin.h:628
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
Definition: avx2intrin.h:2169
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256...
Definition: avx2intrin.h:1901
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and...
Definition: avx2intrin.h:1670
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M)
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the...
Definition: avx2intrin.h:3513
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
Definition: avx2intrin.h:2292
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
Definition: avx2intrin.h:3227
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi16(__m128i __V)
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
Definition: avx2intrin.h:1517
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned satur...
Definition: avx2intrin.h:2659
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
Definition: avx2intrin.h:1214
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi32(__m256i __a, __m256i __b)
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower ...
Definition: avx2intrin.h:1773
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
Definition: avx2intrin.h:2528
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32...
Definition: avx2intrin.h:3303
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than an...
Definition: avx2intrin.h:732
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8(__m256i __a, __m256i __b)
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns...
Definition: avx2intrin.h:1100
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64(__m256i __a, __m256i __b)
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
Definition: avx2intrin.h:2580
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given...
Definition: avx2intrin.h:3795
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater...
Definition: avx2intrin.h:786
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed by...
Definition: avx2intrin.h:1049
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
Definition: avx2intrin.h:2719
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and r...
Definition: avx2intrin.h:951
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
Definition: avx2intrin.h:2921
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned ...
Definition: avx2intrin.h:409
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastsi128_si256(__m128i __X)
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result...
Definition: avx2intrin.h:3071
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
Definition: avx2intrin.h:1195
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit ...
Definition: avx2intrin.h:1697
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater...
Definition: avx2intrin.h:812
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits spec...
Definition: avx2intrin.h:2150
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memor...
Definition: avx2intrin.h:3729
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srav_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
Definition: avx2intrin.h:3840
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 b...
Definition: avx2intrin.h:278
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_broadcastsd_pd(__m128d __X)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
Definition: avx2intrin.h:3055
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the...
Definition: avx2intrin.h:297
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x f...
Definition: avx2intrin.h:3361
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_xor_si256(__m256i __a, __m256i __b)
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
Definition: avx2intrin.h:2969
static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8(__m256i __a)
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vecto...
Definition: avx2intrin.h:1332
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturat...
Definition: avx2intrin.h:2606
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
Definition: avx2intrin.h:3907
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits give...
Definition: avx2intrin.h:3951
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
Definition: avx2intrin.h:1157
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the ...
Definition: avx2intrin.h:560
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64(__m256i __a, __m256i __b)
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the ...
Definition: avx2intrin.h:335
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __...
Definition: avx2intrin.h:2070
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M)
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the ...
Definition: avx2intrin.h:3577
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given...
Definition: avx2intrin.h:3773
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
Definition: avx2intrin.h:1233
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadds_epi16(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using ...
Definition: avx2intrin.h:915
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16(__m256i __a, __m256i __b)
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and...
Definition: avx2intrin.h:1119
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b...
Definition: avx2intrin.h:2788
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
Definition: avx2intrin.h:1138
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given...
Definition: avx2intrin.h:3751
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and re...
Definition: avx2intrin.h:983
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastb_epi8(__m128i __X)
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
Definition: avx2intrin.h:3163
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
Definition: avx2intrin.h:3004
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
Definition: avx2intrin.h:2754
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits give...
Definition: avx2intrin.h:3929
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a, __m256i __b)
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a,...
Definition: avx2intrin.h:2028
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper...
Definition: avx2intrin.h:1735
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greate...
Definition: avx2intrin.h:760
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi32(__m128i __V)
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
Definition: avx2intrin.h:1594
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and r...
Definition: avx2intrin.h:706
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi16(__m256i __a, __m256i __b)
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation,...
Definition: avx2intrin.h:164
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using...
Definition: avx2intrin.h:1019
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
Definition: avx2intrin.h:3243
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and ...
Definition: avx2intrin.h:654
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memo...
Definition: avx2intrin.h:3699
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8(__m256i __a, __m256i __b)
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
Definition: avx2intrin.h:2501
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upp...
Definition: avx2intrin.h:1716
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits,...
Definition: avx2intrin.h:2129
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32(__m256i __a, __m128i __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits give...
Definition: avx2intrin.h:2314
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srav_epi32(__m128i __X, __m128i __Y)
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits give...
Definition: avx2intrin.h:3863
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32(__m256i __a, __m256i __b)
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and ...
Definition: avx2intrin.h:1252
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi64(__m128i __V)
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
Definition: avx2intrin.h:1568
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a, __m256i __b)
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit interme...
Definition: avx2intrin.h:1081
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_broadcastsd_pd(__m128d __a)
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double]...
Definition: avx2intrin.h:3021
#define __DEFAULT_FN_ATTRS256
Definition: avx2intrin.h:18
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits give...
Definition: avx2intrin.h:3885
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16...
Definition: avx2intrin.h:525
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8(__m256i __a, __m256i __b)
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and retur...
Definition: avx2intrin.h:1271
#define __DEFAULT_FN_ATTRS128
Definition: avx2intrin.h:21
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation,...
Definition: avx2intrin.h:259
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to fo...
Definition: avx2intrin.h:2852
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using sign...
Definition: avx2intrin.h:2632
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and r...
Definition: avx2intrin.h:680
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32(__m256i __a, __m256i __b)
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a an...
Definition: avx2intrin.h:1309
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32(__m256i __a, __m256i __b)
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
Definition: avx2intrin.h:2554
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi64(__m128i __V)
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
Definition: avx2intrin.h:1619
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastw_epi16(__m128i __X)
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result'...
Definition: avx2intrin.h:3179
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi16(__m256i __a, __m256i __b)
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower...
Definition: avx2intrin.h:1754
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
Definition: avx2intrin.h:2250
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __...
Definition: avx2intrin.h:2887
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi32_epi64(__m128i __V)
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
Definition: avx2intrin.h:1491
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation...
Definition: avx2intrin.h:391
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b...
Definition: avx2intrin.h:2818
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi16(__m256i __a, __m256i __b)
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation,...
Definition: avx2intrin.h:227
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi8(__m256i __a, __m256i __b)
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation,...
Definition: avx2intrin.h:354
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result...
Definition: avx2intrin.h:3275
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64(__m256i __a, __m128i __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits give...
Definition: avx2intrin.h:2474
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits,...
Definition: avx2intrin.h:2413
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_andnot_si256(__m256i __a, __m256i __b)
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit int...
Definition: avx2intrin.h:473
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi64(__m256i __a, int __count)
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits,...
Definition: avx2intrin.h:2209
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu32_epi64(__m128i __V)
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit value...
Definition: avx2intrin.h:1644
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16(__m256i __a, __m128i __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits giv...
Definition: avx2intrin.h:2272
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastd_epi32(__m128i __X)
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result'...
Definition: avx2intrin.h:3195
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi64(__m128i __V)
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and return...
Definition: avx2intrin.h:1466
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M)
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the c...
Definition: avx2intrin.h:3609
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi32(__m256i __a)
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a a...
Definition: avx2intrin.h:133
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi64(__m128i __V)
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit value...
Definition: avx2intrin.h:1413
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a, __m256i __b)
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in _...
Definition: avx2intrin.h:2049
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a, __m256i __b)
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and ret...
Definition: avx2intrin.h:880
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epu32(__m256i __a, __m256i __b)
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] an...
Definition: avx2intrin.h:1799
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi64(__m128i __X, __m128i __Y)
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given...
Definition: avx2intrin.h:3817
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b)
Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers fr...
Definition: avx2intrin.h:1862
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16(__m256i __a)
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a ...
Definition: avx2intrin.h:116
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi16(__m256i __a, __m256i __b)
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed sa...
Definition: avx2intrin.h:372
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_broadcastss_ps(__m128 __X)
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] ...
Definition: avx2intrin.h:3038
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a, __m256i __b)
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a a...
Definition: avx2intrin.h:499
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32(__m256i __a, __m256i __b)
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the ...
Definition: avx2intrin.h:316
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_or_si256(__m256i __a, __m256i __b)
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
Definition: avx2intrin.h:1817
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu16(__m256i __a, __m256i __b)
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsi...
Definition: avx2intrin.h:2685
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastq_epi64(__m128i __X)
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result'...
Definition: avx2intrin.h:3211
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits,...
Definition: avx2intrin.h:2373
static __inline__ void int __a
Definition: emmintrin.h:4058
__inline unsigned int unsigned int __Y
Definition: bmi2intrin.h:19