clang 20.0.0git
avxintrin.h
Go to the documentation of this file.
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* Internal 256-bit vector views with signed or floating-point elements. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Internal 256-bit vector views with unsigned elements. */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* An explicitly signed byte vector is needed in addition to the plain char
 * one. It is internal only and should not appear in the interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types, aligned to 32 bytes. */
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i
    __attribute__((__vector_size__(32), __aligned__(32)));

/* Counterparts of the public types with 1-byte alignment. */
typedef float __m256_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u
    __attribute__((__vector_size__(32), __aligned__(1)));
41
42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Default attribute sets for the functions in this file: one for 256-bit
 * intrinsics and one (suffix 128) for 128-bit intrinsics. When __EVEX512__ is
 * defined without __AVX10_1_512__, "no-evex512" is appended to the target
 * feature string. */
#if !defined(__EVEX512__) || defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx,no-evex512"), __min_vector_width__(128)))
#endif
68
/* Attribute sets for intrinsics that may additionally be constexpr. In C++11
 * and later the constexpr keyword is appended; otherwise each macro is the
 * plain attribute set of the SAME vector width (256-bit for
 * __DEFAULT_FN_ATTRS_CONSTEXPR, 128-bit for __DEFAULT_FN_ATTRS128_CONSTEXPR).
 */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif
76
77/* Arithmetic */
78/// Adds two 256-bit vectors of [4 x double].
79///
80/// \headerfile <x86intrin.h>
81///
82/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
83///
84/// \param __a
85/// A 256-bit vector of [4 x double] containing one of the source operands.
86/// \param __b
87/// A 256-bit vector of [4 x double] containing one of the source operands.
88/// \returns A 256-bit vector of [4 x double] containing the sums of both
89/// operands.
90static __inline __m256d __DEFAULT_FN_ATTRS
91_mm256_add_pd(__m256d __a, __m256d __b)
92{
93 return (__m256d)((__v4df)__a+(__v4df)__b);
94}
95
96/// Adds two 256-bit vectors of [8 x float].
97///
98/// \headerfile <x86intrin.h>
99///
100/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
101///
102/// \param __a
103/// A 256-bit vector of [8 x float] containing one of the source operands.
104/// \param __b
105/// A 256-bit vector of [8 x float] containing one of the source operands.
106/// \returns A 256-bit vector of [8 x float] containing the sums of both
107/// operands.
108static __inline __m256 __DEFAULT_FN_ATTRS
109_mm256_add_ps(__m256 __a, __m256 __b)
110{
111 return (__m256)((__v8sf)__a+(__v8sf)__b);
112}
113
114/// Subtracts two 256-bit vectors of [4 x double].
115///
116/// \headerfile <x86intrin.h>
117///
118/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
119///
120/// \param __a
121/// A 256-bit vector of [4 x double] containing the minuend.
122/// \param __b
123/// A 256-bit vector of [4 x double] containing the subtrahend.
124/// \returns A 256-bit vector of [4 x double] containing the differences between
125/// both operands.
126static __inline __m256d __DEFAULT_FN_ATTRS
127_mm256_sub_pd(__m256d __a, __m256d __b)
128{
129 return (__m256d)((__v4df)__a-(__v4df)__b);
130}
131
132/// Subtracts two 256-bit vectors of [8 x float].
133///
134/// \headerfile <x86intrin.h>
135///
136/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
137///
138/// \param __a
139/// A 256-bit vector of [8 x float] containing the minuend.
140/// \param __b
141/// A 256-bit vector of [8 x float] containing the subtrahend.
142/// \returns A 256-bit vector of [8 x float] containing the differences between
143/// both operands.
144static __inline __m256 __DEFAULT_FN_ATTRS
145_mm256_sub_ps(__m256 __a, __m256 __b)
146{
147 return (__m256)((__v8sf)__a-(__v8sf)__b);
148}
149
/// Subtracts the even-indexed values (elements 0 and 2) and adds the
/// odd-indexed values (elements 1 and 3) of two 256-bit vectors of
/// [4 x double].
152///
153/// \headerfile <x86intrin.h>
154///
155/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
156///
157/// \param __a
158/// A 256-bit vector of [4 x double] containing the left source operand.
159/// \param __b
160/// A 256-bit vector of [4 x double] containing the right source operand.
161/// \returns A 256-bit vector of [4 x double] containing the alternating sums
162/// and differences between both operands.
163static __inline __m256d __DEFAULT_FN_ATTRS
164_mm256_addsub_pd(__m256d __a, __m256d __b)
165{
166 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
167}
168
/// Subtracts the even-indexed values (elements 0, 2, 4, 6) and adds the
/// odd-indexed values (elements 1, 3, 5, 7) of two 256-bit vectors of
/// [8 x float].
171///
172/// \headerfile <x86intrin.h>
173///
174/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
175///
176/// \param __a
177/// A 256-bit vector of [8 x float] containing the left source operand.
178/// \param __b
179/// A 256-bit vector of [8 x float] containing the right source operand.
180/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
181/// differences between both operands.
182static __inline __m256 __DEFAULT_FN_ATTRS
183_mm256_addsub_ps(__m256 __a, __m256 __b)
184{
185 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
186}
187
188/// Divides two 256-bit vectors of [4 x double].
189///
190/// \headerfile <x86intrin.h>
191///
192/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
193///
194/// \param __a
195/// A 256-bit vector of [4 x double] containing the dividend.
196/// \param __b
197/// A 256-bit vector of [4 x double] containing the divisor.
198/// \returns A 256-bit vector of [4 x double] containing the quotients of both
199/// operands.
200static __inline __m256d __DEFAULT_FN_ATTRS
201_mm256_div_pd(__m256d __a, __m256d __b)
202{
203 return (__m256d)((__v4df)__a/(__v4df)__b);
204}
205
206/// Divides two 256-bit vectors of [8 x float].
207///
208/// \headerfile <x86intrin.h>
209///
210/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
211///
212/// \param __a
213/// A 256-bit vector of [8 x float] containing the dividend.
214/// \param __b
215/// A 256-bit vector of [8 x float] containing the divisor.
216/// \returns A 256-bit vector of [8 x float] containing the quotients of both
217/// operands.
218static __inline __m256 __DEFAULT_FN_ATTRS
219_mm256_div_ps(__m256 __a, __m256 __b)
220{
221 return (__m256)((__v8sf)__a/(__v8sf)__b);
222}
223
224/// Compares two 256-bit vectors of [4 x double] and returns the greater
225/// of each pair of values.
226///
227/// If either value in a comparison is NaN, returns the value from \a __b.
228///
229/// \headerfile <x86intrin.h>
230///
231/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
232///
233/// \param __a
234/// A 256-bit vector of [4 x double] containing one of the operands.
235/// \param __b
236/// A 256-bit vector of [4 x double] containing one of the operands.
237/// \returns A 256-bit vector of [4 x double] containing the maximum values
238/// between both operands.
239static __inline __m256d __DEFAULT_FN_ATTRS
240_mm256_max_pd(__m256d __a, __m256d __b)
241{
242 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
243}
244
245/// Compares two 256-bit vectors of [8 x float] and returns the greater
246/// of each pair of values.
247///
248/// If either value in a comparison is NaN, returns the value from \a __b.
249///
250/// \headerfile <x86intrin.h>
251///
252/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
253///
254/// \param __a
255/// A 256-bit vector of [8 x float] containing one of the operands.
256/// \param __b
257/// A 256-bit vector of [8 x float] containing one of the operands.
258/// \returns A 256-bit vector of [8 x float] containing the maximum values
259/// between both operands.
260static __inline __m256 __DEFAULT_FN_ATTRS
261_mm256_max_ps(__m256 __a, __m256 __b)
262{
263 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
264}
265
266/// Compares two 256-bit vectors of [4 x double] and returns the lesser
267/// of each pair of values.
268///
269/// If either value in a comparison is NaN, returns the value from \a __b.
270///
271/// \headerfile <x86intrin.h>
272///
273/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
274///
275/// \param __a
276/// A 256-bit vector of [4 x double] containing one of the operands.
277/// \param __b
278/// A 256-bit vector of [4 x double] containing one of the operands.
279/// \returns A 256-bit vector of [4 x double] containing the minimum values
280/// between both operands.
281static __inline __m256d __DEFAULT_FN_ATTRS
282_mm256_min_pd(__m256d __a, __m256d __b)
283{
284 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
285}
286
287/// Compares two 256-bit vectors of [8 x float] and returns the lesser
288/// of each pair of values.
289///
290/// If either value in a comparison is NaN, returns the value from \a __b.
291///
292/// \headerfile <x86intrin.h>
293///
294/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
295///
296/// \param __a
297/// A 256-bit vector of [8 x float] containing one of the operands.
298/// \param __b
299/// A 256-bit vector of [8 x float] containing one of the operands.
300/// \returns A 256-bit vector of [8 x float] containing the minimum values
301/// between both operands.
302static __inline __m256 __DEFAULT_FN_ATTRS
303_mm256_min_ps(__m256 __a, __m256 __b)
304{
305 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
306}
307
308/// Multiplies two 256-bit vectors of [4 x double].
309///
310/// \headerfile <x86intrin.h>
311///
312/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
313///
314/// \param __a
315/// A 256-bit vector of [4 x double] containing one of the operands.
316/// \param __b
317/// A 256-bit vector of [4 x double] containing one of the operands.
318/// \returns A 256-bit vector of [4 x double] containing the products of both
319/// operands.
320static __inline __m256d __DEFAULT_FN_ATTRS
321_mm256_mul_pd(__m256d __a, __m256d __b)
322{
323 return (__m256d)((__v4df)__a * (__v4df)__b);
324}
325
326/// Multiplies two 256-bit vectors of [8 x float].
327///
328/// \headerfile <x86intrin.h>
329///
330/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
331///
332/// \param __a
333/// A 256-bit vector of [8 x float] containing one of the operands.
334/// \param __b
335/// A 256-bit vector of [8 x float] containing one of the operands.
336/// \returns A 256-bit vector of [8 x float] containing the products of both
337/// operands.
338static __inline __m256 __DEFAULT_FN_ATTRS
339_mm256_mul_ps(__m256 __a, __m256 __b)
340{
341 return (__m256)((__v8sf)__a * (__v8sf)__b);
342}
343
344/// Calculates the square roots of the values in a 256-bit vector of
345/// [4 x double].
346///
347/// \headerfile <x86intrin.h>
348///
349/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
350///
351/// \param __a
352/// A 256-bit vector of [4 x double].
353/// \returns A 256-bit vector of [4 x double] containing the square roots of the
354/// values in the operand.
355static __inline __m256d __DEFAULT_FN_ATTRS
357{
358 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
359}
360
361/// Calculates the square roots of the values in a 256-bit vector of
362/// [8 x float].
363///
364/// \headerfile <x86intrin.h>
365///
366/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
367///
368/// \param __a
369/// A 256-bit vector of [8 x float].
370/// \returns A 256-bit vector of [8 x float] containing the square roots of the
371/// values in the operand.
372static __inline __m256 __DEFAULT_FN_ATTRS
374{
375 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
376}
377
378/// Calculates the reciprocal square roots of the values in a 256-bit
379/// vector of [8 x float].
380///
381/// \headerfile <x86intrin.h>
382///
383/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
384///
385/// \param __a
386/// A 256-bit vector of [8 x float].
387/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
388/// roots of the values in the operand.
389static __inline __m256 __DEFAULT_FN_ATTRS
391{
392 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
393}
394
395/// Calculates the reciprocals of the values in a 256-bit vector of
396/// [8 x float].
397///
398/// \headerfile <x86intrin.h>
399///
400/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
401///
402/// \param __a
403/// A 256-bit vector of [8 x float].
404/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
405/// values in the operand.
406static __inline __m256 __DEFAULT_FN_ATTRS
408{
409 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
410}
411
412/// Rounds the values in a 256-bit vector of [4 x double] as specified
413/// by the byte operand. The source values are rounded to integer values and
414/// returned as 64-bit double-precision floating-point values.
415///
416/// \headerfile <x86intrin.h>
417///
418/// \code
419/// __m256d _mm256_round_pd(__m256d V, const int M);
420/// \endcode
421///
422/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
423///
424/// \param V
425/// A 256-bit vector of [4 x double].
426/// \param M
427/// An integer value that specifies the rounding operation. \n
428/// Bits [7:4] are reserved. \n
429/// Bit [3] is a precision exception value: \n
430/// 0: A normal PE exception is used. \n
431/// 1: The PE field is not updated. \n
432/// Bit [2] is the rounding control source: \n
433/// 0: Use bits [1:0] of \a M. \n
434/// 1: Use the current MXCSR setting. \n
435/// Bits [1:0] contain the rounding control definition: \n
436/// 00: Nearest. \n
437/// 01: Downward (toward negative infinity). \n
438/// 10: Upward (toward positive infinity). \n
439/// 11: Truncated.
440/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Expression macro: V is converted to __m256d and then viewed as
 * [4 x double]; M is forwarded unchanged to the builtin.
 * NOTE(review): presumably a macro rather than a function so that M reaches
 * the builtin as a compile-time constant -- confirm. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
443
444/// Rounds the values stored in a 256-bit vector of [8 x float] as
445/// specified by the byte operand. The source values are rounded to integer
446/// values and returned as floating-point values.
447///
448/// \headerfile <x86intrin.h>
449///
450/// \code
451/// __m256 _mm256_round_ps(__m256 V, const int M);
452/// \endcode
453///
454/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
455///
456/// \param V
457/// A 256-bit vector of [8 x float].
458/// \param M
459/// An integer value that specifies the rounding operation. \n
460/// Bits [7:4] are reserved. \n
461/// Bit [3] is a precision exception value: \n
462/// 0: A normal PE exception is used. \n
463/// 1: The PE field is not updated. \n
464/// Bit [2] is the rounding control source: \n
465/// 0: Use bits [1:0] of \a M. \n
466/// 1: Use the current MXCSR setting. \n
467/// Bits [1:0] contain the rounding control definition: \n
468/// 00: Nearest. \n
469/// 01: Downward (toward negative infinity). \n
470/// 10: Upward (toward positive infinity). \n
471/// 11: Truncated.
472/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Expression macro: V is converted to __m256 and then viewed as
 * [8 x float]; M is forwarded unchanged to the builtin.
 * NOTE(review): presumably a macro rather than a function so that M reaches
 * the builtin as a compile-time constant -- confirm. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
475
476/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
477/// source values are rounded up to integer values and returned as 64-bit
478/// double-precision floating-point values.
479///
480/// \headerfile <x86intrin.h>
481///
482/// \code
483/// __m256d _mm256_ceil_pd(__m256d V);
484/// \endcode
485///
486/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
487///
488/// \param V
489/// A 256-bit vector of [4 x double].
490/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* Shorthand for _mm256_round_pd with the _MM_FROUND_CEIL mode. That constant
 * is not defined in the visible part of this file. */
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
492
493/// Rounds down the values stored in a 256-bit vector of [4 x double].
494/// The source values are rounded down to integer values and returned as
495/// 64-bit double-precision floating-point values.
496///
497/// \headerfile <x86intrin.h>
498///
499/// \code
500/// __m256d _mm256_floor_pd(__m256d V);
501/// \endcode
502///
503/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
504///
505/// \param V
506/// A 256-bit vector of [4 x double].
507/// \returns A 256-bit vector of [4 x double] containing the rounded down
508/// values.
/* Shorthand for _mm256_round_pd with the _MM_FROUND_FLOOR mode. That constant
 * is not defined in the visible part of this file. */
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
510
511/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
512/// source values are rounded up to integer values and returned as
513/// floating-point values.
514///
515/// \headerfile <x86intrin.h>
516///
517/// \code
518/// __m256 _mm256_ceil_ps(__m256 V);
519/// \endcode
520///
521/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
522///
523/// \param V
524/// A 256-bit vector of [8 x float].
525/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/* Shorthand for _mm256_round_ps with the _MM_FROUND_CEIL mode. That constant
 * is not defined in the visible part of this file. */
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
527
528/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
529/// source values are rounded down to integer values and returned as
530/// floating-point values.
531///
532/// \headerfile <x86intrin.h>
533///
534/// \code
535/// __m256 _mm256_floor_ps(__m256 V);
536/// \endcode
537///
538/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
539///
540/// \param V
541/// A 256-bit vector of [8 x float].
542/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
/* Shorthand for _mm256_round_ps with the _MM_FROUND_FLOOR mode. That constant
 * is not defined in the visible part of this file. */
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
544
545/* Logical */
546/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
547///
548/// \headerfile <x86intrin.h>
549///
550/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
551///
552/// \param __a
553/// A 256-bit vector of [4 x double] containing one of the source operands.
554/// \param __b
555/// A 256-bit vector of [4 x double] containing one of the source operands.
556/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
557/// values between both operands.
558static __inline __m256d __DEFAULT_FN_ATTRS
559_mm256_and_pd(__m256d __a, __m256d __b)
560{
561 return (__m256d)((__v4du)__a & (__v4du)__b);
562}
563
564/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
565///
566/// \headerfile <x86intrin.h>
567///
568/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
569///
570/// \param __a
571/// A 256-bit vector of [8 x float] containing one of the source operands.
572/// \param __b
573/// A 256-bit vector of [8 x float] containing one of the source operands.
574/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
575/// values between both operands.
576static __inline __m256 __DEFAULT_FN_ATTRS
577_mm256_and_ps(__m256 __a, __m256 __b)
578{
579 return (__m256)((__v8su)__a & (__v8su)__b);
580}
581
582/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
583/// the one's complement of the values contained in the first source operand.
584///
585/// \headerfile <x86intrin.h>
586///
587/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
588///
589/// \param __a
590/// A 256-bit vector of [4 x double] containing the left source operand. The
591/// one's complement of this value is used in the bitwise AND.
592/// \param __b
593/// A 256-bit vector of [4 x double] containing the right source operand.
594/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
595/// values of the second operand and the one's complement of the first
596/// operand.
597static __inline __m256d __DEFAULT_FN_ATTRS
598_mm256_andnot_pd(__m256d __a, __m256d __b)
599{
600 return (__m256d)(~(__v4du)__a & (__v4du)__b);
601}
602
603/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
604/// the one's complement of the values contained in the first source operand.
605///
606/// \headerfile <x86intrin.h>
607///
608/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
609///
610/// \param __a
611/// A 256-bit vector of [8 x float] containing the left source operand. The
612/// one's complement of this value is used in the bitwise AND.
613/// \param __b
614/// A 256-bit vector of [8 x float] containing the right source operand.
615/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
616/// values of the second operand and the one's complement of the first
617/// operand.
618static __inline __m256 __DEFAULT_FN_ATTRS
619_mm256_andnot_ps(__m256 __a, __m256 __b)
620{
621 return (__m256)(~(__v8su)__a & (__v8su)__b);
622}
623
624/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
625///
626/// \headerfile <x86intrin.h>
627///
628/// This intrinsic corresponds to the <c> VORPD </c> instruction.
629///
630/// \param __a
631/// A 256-bit vector of [4 x double] containing one of the source operands.
632/// \param __b
633/// A 256-bit vector of [4 x double] containing one of the source operands.
634/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
635/// values between both operands.
636static __inline __m256d __DEFAULT_FN_ATTRS
637_mm256_or_pd(__m256d __a, __m256d __b)
638{
639 return (__m256d)((__v4du)__a | (__v4du)__b);
640}
641
642/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
643///
644/// \headerfile <x86intrin.h>
645///
646/// This intrinsic corresponds to the <c> VORPS </c> instruction.
647///
648/// \param __a
649/// A 256-bit vector of [8 x float] containing one of the source operands.
650/// \param __b
651/// A 256-bit vector of [8 x float] containing one of the source operands.
652/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
653/// values between both operands.
654static __inline __m256 __DEFAULT_FN_ATTRS
655_mm256_or_ps(__m256 __a, __m256 __b)
656{
657 return (__m256)((__v8su)__a | (__v8su)__b);
658}
659
660/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
661///
662/// \headerfile <x86intrin.h>
663///
664/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
665///
666/// \param __a
667/// A 256-bit vector of [4 x double] containing one of the source operands.
668/// \param __b
669/// A 256-bit vector of [4 x double] containing one of the source operands.
670/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
671/// values between both operands.
672static __inline __m256d __DEFAULT_FN_ATTRS
673_mm256_xor_pd(__m256d __a, __m256d __b)
674{
675 return (__m256d)((__v4du)__a ^ (__v4du)__b);
676}
677
678/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
679///
680/// \headerfile <x86intrin.h>
681///
682/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
683///
684/// \param __a
685/// A 256-bit vector of [8 x float] containing one of the source operands.
686/// \param __b
687/// A 256-bit vector of [8 x float] containing one of the source operands.
688/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
689/// values between both operands.
690static __inline __m256 __DEFAULT_FN_ATTRS
691_mm256_xor_ps(__m256 __a, __m256 __b)
692{
693 return (__m256)((__v8su)__a ^ (__v8su)__b);
694}
695
696/* Horizontal arithmetic */
697/// Horizontally adds the adjacent pairs of values contained in two
698/// 256-bit vectors of [4 x double].
699///
700/// \headerfile <x86intrin.h>
701///
702/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
703///
704/// \param __a
705/// A 256-bit vector of [4 x double] containing one of the source operands.
706/// The horizontal sums of the values are returned in the even-indexed
707/// elements of a vector of [4 x double].
708/// \param __b
709/// A 256-bit vector of [4 x double] containing one of the source operands.
710/// The horizontal sums of the values are returned in the odd-indexed
711/// elements of a vector of [4 x double].
712/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
713/// both operands.
714static __inline __m256d __DEFAULT_FN_ATTRS
715_mm256_hadd_pd(__m256d __a, __m256d __b)
716{
717 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
718}
719
720/// Horizontally adds the adjacent pairs of values contained in two
721/// 256-bit vectors of [8 x float].
722///
723/// \headerfile <x86intrin.h>
724///
725/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
726///
727/// \param __a
728/// A 256-bit vector of [8 x float] containing one of the source operands.
729/// The horizontal sums of the values are returned in the elements with
730/// index 0, 1, 4, 5 of a vector of [8 x float].
731/// \param __b
732/// A 256-bit vector of [8 x float] containing one of the source operands.
733/// The horizontal sums of the values are returned in the elements with
734/// index 2, 3, 6, 7 of a vector of [8 x float].
735/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
736/// both operands.
737static __inline __m256 __DEFAULT_FN_ATTRS
738_mm256_hadd_ps(__m256 __a, __m256 __b)
739{
740 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
741}
742
743/// Horizontally subtracts the adjacent pairs of values contained in two
744/// 256-bit vectors of [4 x double].
745///
746/// \headerfile <x86intrin.h>
747///
748/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
749///
750/// \param __a
751/// A 256-bit vector of [4 x double] containing one of the source operands.
752/// The horizontal differences between the values are returned in the
753/// even-indexed elements of a vector of [4 x double].
754/// \param __b
755/// A 256-bit vector of [4 x double] containing one of the source operands.
756/// The horizontal differences between the values are returned in the
757/// odd-indexed elements of a vector of [4 x double].
758/// \returns A 256-bit vector of [4 x double] containing the horizontal
759/// differences of both operands.
760static __inline __m256d __DEFAULT_FN_ATTRS
761_mm256_hsub_pd(__m256d __a, __m256d __b)
762{
763 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
764}
765
766/// Horizontally subtracts the adjacent pairs of values contained in two
767/// 256-bit vectors of [8 x float].
768///
769/// \headerfile <x86intrin.h>
770///
771/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
772///
773/// \param __a
774/// A 256-bit vector of [8 x float] containing one of the source operands.
775/// The horizontal differences between the values are returned in the
776/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
777/// \param __b
778/// A 256-bit vector of [8 x float] containing one of the source operands.
779/// The horizontal differences between the values are returned in the
780/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
781/// \returns A 256-bit vector of [8 x float] containing the horizontal
782/// differences of both operands.
783static __inline __m256 __DEFAULT_FN_ATTRS
784_mm256_hsub_ps(__m256 __a, __m256 __b)
785{
786 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
787}
788
789/* Vector permutations */
790/// Copies the values in a 128-bit vector of [2 x double] as specified
791/// by the 128-bit integer vector operand.
792///
793/// \headerfile <x86intrin.h>
794///
795/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
796///
797/// \param __a
798/// A 128-bit vector of [2 x double].
799/// \param __c
800/// A 128-bit integer vector operand specifying how the values are to be
801/// copied. \n
802/// Bit [1]: \n
803/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
804/// vector. \n
805/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
806/// returned vector. \n
807/// Bit [65]: \n
808/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
809/// returned vector. \n
810/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
811/// returned vector.
812/// \returns A 128-bit vector of [2 x double] containing the copied values.
813static __inline __m128d __DEFAULT_FN_ATTRS128
814_mm_permutevar_pd(__m128d __a, __m128i __c)
815{
816 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
817}
818
819/// Copies the values in a 256-bit vector of [4 x double] as specified
820/// by the 256-bit integer vector operand.
821///
822/// \headerfile <x86intrin.h>
823///
824/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
825///
826/// \param __a
827/// A 256-bit vector of [4 x double].
828/// \param __c
829/// A 256-bit integer vector operand specifying how the values are to be
830/// copied. \n
831/// Bit [1]: \n
832/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
833/// vector. \n
834/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
835/// returned vector. \n
836/// Bit [65]: \n
837/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
838/// returned vector. \n
839/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
840/// returned vector. \n
841/// Bit [129]: \n
842/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
843/// returned vector. \n
844/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
845/// returned vector. \n
846/// Bit [193]: \n
847/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
848/// returned vector. \n
849/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
850/// returned vector.
851/// \returns A 256-bit vector of [4 x double] containing the copied values.
852static __inline __m256d __DEFAULT_FN_ATTRS
853_mm256_permutevar_pd(__m256d __a, __m256i __c)
854{
855 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
856}
857
858/// Copies the values stored in a 128-bit vector of [4 x float] as
859/// specified by the 128-bit integer vector operand.
860///
861/// \headerfile <x86intrin.h>
862///
863/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
864///
865/// \param __a
866/// A 128-bit vector of [4 x float].
867/// \param __c
868/// A 128-bit integer vector operand specifying how the values are to be
869/// copied. \n
870/// Bits [1:0]: \n
871/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
872/// returned vector. \n
873/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
874/// returned vector. \n
875/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
876/// returned vector. \n
877/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
878/// returned vector. \n
879/// Bits [33:32]: \n
880/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
881/// returned vector. \n
882/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
883/// returned vector. \n
884/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
885/// returned vector. \n
886/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
887/// returned vector. \n
888/// Bits [65:64]: \n
889/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
890/// returned vector. \n
891/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
892/// returned vector. \n
893/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
894/// returned vector. \n
895/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
896/// returned vector. \n
897/// Bits [97:96]: \n
898/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
899/// returned vector. \n
900/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
901/// returned vector. \n
902/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
903/// returned vector. \n
904/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
905/// returned vector.
906/// \returns A 128-bit vector of [4 x float] containing the copied values.
907static __inline __m128 __DEFAULT_FN_ATTRS128
908_mm_permutevar_ps(__m128 __a, __m128i __c)
909{
910 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
911}
912
913/// Copies the values stored in a 256-bit vector of [8 x float] as
914/// specified by the 256-bit integer vector operand.
915///
916/// \headerfile <x86intrin.h>
917///
918/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
919///
920/// \param __a
921/// A 256-bit vector of [8 x float].
922/// \param __c
923/// A 256-bit integer vector operand specifying how the values are to be
924/// copied. \n
925/// Bits [1:0]: \n
926/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
927/// returned vector. \n
928/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
929/// returned vector. \n
930/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
931/// returned vector. \n
932/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
933/// returned vector. \n
934/// Bits [33:32]: \n
935/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
936/// returned vector. \n
937/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
938/// returned vector. \n
939/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
940/// returned vector. \n
941/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
942/// returned vector. \n
943/// Bits [65:64]: \n
944/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
945/// returned vector. \n
946/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
947/// returned vector. \n
948/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
949/// returned vector. \n
950/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
951/// returned vector. \n
952/// Bits [97:96]: \n
953/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
954/// returned vector. \n
955/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
956/// returned vector. \n
957/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
958/// returned vector. \n
959/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
960/// returned vector. \n
961/// Bits [129:128]: \n
962/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
963/// returned vector. \n
964/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
965/// returned vector. \n
966/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
967/// returned vector. \n
968/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
969/// returned vector. \n
970/// Bits [161:160]: \n
971/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
972/// returned vector. \n
973/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
974/// returned vector. \n
975/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
976/// returned vector. \n
977/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
978/// returned vector. \n
979/// Bits [193:192]: \n
980/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
981/// returned vector. \n
982/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
983/// returned vector. \n
984/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
985/// returned vector. \n
986/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
987/// returned vector. \n
988/// Bits [225:224]: \n
989/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
990/// returned vector. \n
991/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
992/// returned vector. \n
993/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
994/// returned vector. \n
995/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
996/// returned vector.
997/// \returns A 256-bit vector of [8 x float] containing the copied values.
998static __inline __m256 __DEFAULT_FN_ATTRS
1000{
1001 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
1002}
1003
1004/// Copies the values in a 128-bit vector of [2 x double] as specified
1005/// by the immediate integer operand.
1006///
1007/// \headerfile <x86intrin.h>
1008///
1009/// \code
1010/// __m128d _mm_permute_pd(__m128d A, const int C);
1011/// \endcode
1012///
1013/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1014///
1015/// \param A
1016/// A 128-bit vector of [2 x double].
1017/// \param C
1018/// An immediate integer operand specifying how the values are to be
1019/// copied. \n
1020/// Bit [0]: \n
1021/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1022/// vector. \n
1023/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1024/// returned vector. \n
1025/// Bit [1]: \n
1026/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1027/// returned vector. \n
1028/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1029/// returned vector.
1030/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd( \
      (__v2df)(__m128d)(A), /* source, cast to __m128d then __v2df */ \
      (int)(C)))            /* immediate selector, passed as int */
1033
1034/// Copies the values in a 256-bit vector of [4 x double] as specified by
1035/// the immediate integer operand.
1036///
1037/// \headerfile <x86intrin.h>
1038///
1039/// \code
1040/// __m256d _mm256_permute_pd(__m256d A, const int C);
1041/// \endcode
1042///
1043/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1044///
1045/// \param A
1046/// A 256-bit vector of [4 x double].
1047/// \param C
1048/// An immediate integer operand specifying how the values are to be
1049/// copied. \n
1050/// Bit [0]: \n
1051/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1052/// vector. \n
1053/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1054/// returned vector. \n
1055/// Bit [1]: \n
1056/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1057/// returned vector. \n
1058/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1059/// returned vector. \n
1060/// Bit [2]: \n
1061/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1062/// returned vector. \n
1063/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1064/// returned vector. \n
1065/// Bit [3]: \n
1066/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1067/// returned vector. \n
1068/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1069/// returned vector.
1070/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256( \
      (__v4df)(__m256d)(A), /* source, cast to __m256d then __v4df */ \
      (int)(C)))            /* immediate selector, passed as int */
1073
1074/// Copies the values in a 128-bit vector of [4 x float] as specified by
1075/// the immediate integer operand.
1076///
1077/// \headerfile <x86intrin.h>
1078///
1079/// \code
1080/// __m128 _mm_permute_ps(__m128 A, const int C);
1081/// \endcode
1082///
1083/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1084///
1085/// \param A
1086/// A 128-bit vector of [4 x float].
1087/// \param C
1088/// An immediate integer operand specifying how the values are to be
1089/// copied. \n
1090/// Bits [1:0]: \n
1091/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1092/// returned vector. \n
1093/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1094/// returned vector. \n
1095/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1096/// returned vector. \n
1097/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1098/// returned vector. \n
1099/// Bits [3:2]: \n
1100/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1101/// returned vector. \n
1102/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1103/// returned vector. \n
1104/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1105/// returned vector. \n
1106/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1107/// returned vector. \n
1108/// Bits [5:4]: \n
1109/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1110/// returned vector. \n
1111/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1112/// returned vector. \n
1113/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1114/// returned vector. \n
1115/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1116/// returned vector. \n
1117/// Bits [7:6]: \n
1118/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1119/// returned vector. \n
1120/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1121/// returned vector. \n
1122/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1123/// returned vector. \n
1124/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1125/// returned vector.
1126/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps( \
      (__v4sf)(__m128)(A), /* source, cast to __m128 then __v4sf */ \
      (int)(C)))           /* immediate selector, passed as int */
1129
1130/// Copies the values in a 256-bit vector of [8 x float] as specified by
1131/// the immediate integer operand.
1132///
1133/// \headerfile <x86intrin.h>
1134///
1135/// \code
1136/// __m256 _mm256_permute_ps(__m256 A, const int C);
1137/// \endcode
1138///
1139/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1140///
1141/// \param A
1142/// A 256-bit vector of [8 x float].
1143/// \param C
1144/// An immediate integer operand specifying how the values are to be
1145/// copied. \n
1146/// Bits [1:0]: \n
1147/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1148/// returned vector. \n
1149/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1150/// returned vector. \n
1151/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1152/// returned vector. \n
1153/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1154/// returned vector. \n
1155/// Bits [3:2]: \n
1156/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1157/// returned vector. \n
1158/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1159/// returned vector. \n
1160/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1161/// returned vector. \n
1162/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1163/// returned vector. \n
1164/// Bits [5:4]: \n
1165/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1166/// returned vector. \n
1167/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1168/// returned vector. \n
1169/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1170/// returned vector. \n
1171/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1172/// returned vector. \n
1173/// Bits [7:6]: \n
1174/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1175/// returned vector. \n
1176/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1177/// returned vector. \n
1178/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1179/// returned vector. \n
1180/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1181/// returned vector. \n
1182/// Bits [1:0]: \n
1183/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1184/// returned vector. \n
1185/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1186/// returned vector. \n
1187/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1188/// returned vector. \n
1189/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1190/// returned vector. \n
1191/// Bits [3:2]: \n
1192/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1193/// returned vector. \n
1194/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1195/// returned vector. \n
1196/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1197/// returned vector. \n
1198/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1199/// returned vector. \n
1200/// Bits [5:4]: \n
1201/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1202/// returned vector. \n
1203/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1204/// returned vector. \n
1205/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1206/// returned vector. \n
1207/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1208/// returned vector. \n
1209/// Bits [7:6]: \n
1210/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1211/// returned vector. \n
1212/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1213/// returned vector. \n
1214/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1215/// returned vector. \n
1216/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1217/// returned vector.
1218/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256( \
      (__v8sf)(__m256)(A), /* source, cast to __m256 then __v8sf */ \
      (int)(C)))           /* immediate selector, passed as int */
1221
1222/// Permutes 128-bit data values stored in two 256-bit vectors of
1223/// [4 x double], as specified by the immediate integer operand.
1224///
1225/// \headerfile <x86intrin.h>
1226///
1227/// \code
1228/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1229/// \endcode
1230///
1231/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1232///
1233/// \param V1
1234/// A 256-bit vector of [4 x double].
1235/// \param V2
1236/// A 256-bit vector of [4 x double].
1237/// \param M
1238/// An immediate integer operand specifying how the values are to be
1239/// permuted. \n
1240/// Bits [1:0]: \n
1241/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1242/// destination. \n
1243/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1244/// destination. \n
1245/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1246/// destination. \n
1247/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1248/// destination. \n
1249/// Bits [5:4]: \n
1250/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1251/// destination. \n
1252/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1253/// destination. \n
1254/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1255/// destination. \n
1256/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1257/// destination.
1258/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256( \
      (__v4df)(__m256d)(V1), /* first source vector */ \
      (__v4df)(__m256d)(V2), /* second source vector */ \
      (int)(M)))             /* immediate selector, passed as int */
1262
1263/// Permutes 128-bit data values stored in two 256-bit vectors of
1264/// [8 x float], as specified by the immediate integer operand.
1265///
1266/// \headerfile <x86intrin.h>
1267///
1268/// \code
1269/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1270/// \endcode
1271///
1272/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1273///
1274/// \param V1
1275/// A 256-bit vector of [8 x float].
1276/// \param V2
1277/// A 256-bit vector of [8 x float].
1278/// \param M
1279/// An immediate integer operand specifying how the values are to be
1280/// permuted. \n
1281/// Bits [1:0]: \n
1282/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1283/// destination. \n
1284/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1285/// destination. \n
1286/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1287/// destination. \n
1288/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1289/// destination. \n
1290/// Bits [5:4]: \n
1291/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1292/// destination. \n
1293/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1294/// destination. \n
1295/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1296/// destination. \n
1297/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1298/// destination.
1299/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256( \
      (__v8sf)(__m256)(V1), /* first source vector */ \
      (__v8sf)(__m256)(V2), /* second source vector */ \
      (int)(M)))            /* immediate selector, passed as int */
1303
1304/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1305/// as specified by the immediate integer operand.
1306///
1307/// \headerfile <x86intrin.h>
1308///
1309/// \code
1310/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1311/// \endcode
1312///
1313/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1314///
1315/// \param V1
1316/// A 256-bit integer vector.
1317/// \param V2
1318/// A 256-bit integer vector.
1319/// \param M
1320/// An immediate integer operand specifying how the values are to be copied. \n
1321/// Bits [1:0]: \n
1322/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1323/// destination. \n
1324/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1325/// destination. \n
1326/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1327/// destination. \n
1328/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1329/// destination. \n
1330/// Bits [5:4]: \n
1331/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1332/// destination. \n
1333/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1334/// destination. \n
1335/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1336/// destination. \n
1337/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1338/// destination.
1339/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256( \
      (__v8si)(__m256i)(V1), /* first source vector */ \
      (__v8si)(__m256i)(V2), /* second source vector */ \
      (int)(M)))             /* immediate selector, passed as int */
1343
1344/* Vector Blend */
1345/// Merges 64-bit double-precision data values stored in either of the
1346/// two 256-bit vectors of [4 x double], as specified by the immediate
1347/// integer operand.
1348///
1349/// \headerfile <x86intrin.h>
1350///
1351/// \code
1352/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1353/// \endcode
1354///
1355/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1356///
1357/// \param V1
1358/// A 256-bit vector of [4 x double].
1359/// \param V2
1360/// A 256-bit vector of [4 x double].
1361/// \param M
1362/// An immediate integer operand, with mask bits [3:0] specifying how the
1363/// values are to be copied. The position of the mask bit corresponds to the
1364/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1365/// element in operand \a V1 is copied to the same position in the
1366/// destination. When a mask bit is 1, the corresponding 64-bit element in
1367/// operand \a V2 is copied to the same position in the destination.
1368/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256( \
      (__v4df)(__m256d)(V1), /* first source vector */ \
      (__v4df)(__m256d)(V2), /* second source vector */ \
      (int)(M)))             /* immediate blend mask, passed as int */
1372
1373/// Merges 32-bit single-precision data values stored in either of the
1374/// two 256-bit vectors of [8 x float], as specified by the immediate
1375/// integer operand.
1376///
1377/// \headerfile <x86intrin.h>
1378///
1379/// \code
1380/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1381/// \endcode
1382///
1383/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1384///
1385/// \param V1
1386/// A 256-bit vector of [8 x float].
1387/// \param V2
1388/// A 256-bit vector of [8 x float].
1389/// \param M
1390/// An immediate integer operand, with mask bits [7:0] specifying how the
1391/// values are to be copied. The position of the mask bit corresponds to the
1392/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1393/// element in operand \a V1 is copied to the same position in the
1394/// destination. When a mask bit is 1, the corresponding 32-bit element in
1395/// operand \a V2 is copied to the same position in the destination.
1396/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256( \
      (__v8sf)(__m256)(V1), /* first source vector */ \
      (__v8sf)(__m256)(V2), /* second source vector */ \
      (int)(M)))            /* immediate blend mask, passed as int */
1400
1401/// Merges 64-bit double-precision data values stored in either of the
1402/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1403/// operand.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1408///
1409/// \param __a
1410/// A 256-bit vector of [4 x double].
1411/// \param __b
1412/// A 256-bit vector of [4 x double].
1413/// \param __c
1414/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1415/// how the values are to be copied. The position of the mask bit corresponds
1416/// to the most significant bit of a copied value. When a mask bit is 0, the
1417/// corresponding 64-bit element in operand \a __a is copied to the same
1418/// position in the destination. When a mask bit is 1, the corresponding
1419/// 64-bit element in operand \a __b is copied to the same position in the
1420/// destination.
1421/// \returns A 256-bit vector of [4 x double] containing the copied values.
1422static __inline __m256d __DEFAULT_FN_ATTRS
1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1424{
1425 return (__m256d)__builtin_ia32_blendvpd256(
1426 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1427}
1428
1429/// Merges 32-bit single-precision data values stored in either of the
1430/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1431/// operand.
1432///
1433/// \headerfile <x86intrin.h>
1434///
1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1436///
1437/// \param __a
1438/// A 256-bit vector of [8 x float].
1439/// \param __b
1440/// A 256-bit vector of [8 x float].
1441/// \param __c
1442/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1443/// and 31 specifying how the values are to be copied. The position of the
1444/// mask bit corresponds to the most significant bit of a copied value. When
1445/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1446/// copied to the same position in the destination. When a mask bit is 1, the
1447/// corresponding 32-bit element in operand \a __b is copied to the same
1448/// position in the destination.
1449/// \returns A 256-bit vector of [8 x float] containing the copied values.
1450static __inline __m256 __DEFAULT_FN_ATTRS
1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1452{
1453 return (__m256)__builtin_ia32_blendvps256(
1454 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1455}
1456
1457/* Vector Dot Product */
1458/// Computes two dot products in parallel, using the lower and upper
1459/// halves of two [8 x float] vectors as input to the two computations, and
1460/// returning the two dot products in the lower and upper halves of the
1461/// [8 x float] result.
1462///
1463/// The immediate integer operand controls which input elements will
1464/// contribute to the dot product, and where the final results are returned.
1465/// In general, for each dot product, the four corresponding elements of the
1466/// input vectors are multiplied; the first two and second two products are
1467/// summed, then the two sums are added to form the final result.
1468///
1469/// \headerfile <x86intrin.h>
1470///
1471/// \code
1472/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1473/// \endcode
1474///
1475/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1476///
1477/// \param V1
1478/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1479/// \param V2
1480/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1481/// \param M
1482/// An immediate integer argument. Bits [7:4] determine which elements of
1483/// the input vectors are used, with bit [4] corresponding to the lowest
1484/// element and bit [7] corresponding to the highest element of each [4 x
1485/// float] subvector. If a bit is set, the corresponding elements from the
1486/// two input vectors are used as an input for dot product; otherwise that
1487/// input is treated as zero. Bits [3:0] determine which elements of the
1488/// result will receive a copy of the final dot product, with bit [0]
1489/// corresponding to the lowest element and bit [3] corresponding to the
1490/// highest element of each [4 x float] subvector. If a bit is set, the dot
1491/// product is returned in the corresponding element; otherwise that element
1492/// is set to zero. The bitmask is applied in the same way to each of the
1493/// two parallel dot product computations.
1494/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256( \
      (__v8sf)(__m256)(V1), /* first input vector */ \
      (__v8sf)(__m256)(V2), /* second input vector */ \
      (M)))                 /* immediate mask, forwarded without a cast */
1498
1499/* Vector shuffle */
1500/// Selects 8 float values from the 256-bit operands of [8 x float], as
1501/// specified by the immediate value operand.
1502///
1503/// The four selected elements in each operand are copied to the destination
1504/// according to the bits specified in the immediate operand. The selected
1505/// elements from the first 256-bit operand are copied to bits [63:0] and
1506/// bits [191:128] of the destination, and the selected elements from the
1507/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1508/// the destination. For example, if bits [7:0] of the immediate operand
1509/// contain a value of 0xFF, the 256-bit destination vector would contain the
1510/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1511///
1512/// \headerfile <x86intrin.h>
1513///
1514/// \code
1515/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1516/// \endcode
1517///
1518/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1519///
1520/// \param a
1521/// A 256-bit vector of [8 x float]. The four selected elements in this
1522/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1523/// according to the bits specified in the immediate operand.
1524/// \param b
1525/// A 256-bit vector of [8 x float]. The four selected elements in this
1526/// operand are copied to bits [127:64] and bits [255:192] in the
1527/// destination, according to the bits specified in the immediate operand.
1528/// \param mask
1529/// An immediate value containing an 8-bit value specifying which elements to
1530/// copy from \a a and \a b. \n
1531/// Bits [3:0] specify the values copied from operand \a a. \n
1532/// Bits [7:4] specify the values copied from operand \a b. \n
1533/// The destinations within the 256-bit destination are assigned values as
1534/// follows, according to the bit value assignments described below: \n
1535/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1536/// destination. \n
1537/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1538/// destination. \n
1539/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1540/// destination. \n
1541/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1542/// the destination. \n
1543/// Bit value assignments: \n
1544/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1545/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1546/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1547/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1548/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1549/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1550/// <c>[b6, b4, b2, b0]</c>.
1551/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256( \
      (__v8sf)(__m256)(a), /* first source vector */ \
      (__v8sf)(__m256)(b), /* second source vector */ \
      (int)(mask)))        /* immediate selector, passed as int */
1555
1556/// Selects four double-precision values from the 256-bit operands of
1557/// [4 x double], as specified by the immediate value operand.
1558///
1559/// The selected elements from the first 256-bit operand are copied to bits
1560/// [63:0] and bits [191:128] in the destination, and the selected elements
1561/// from the second 256-bit operand are copied to bits [127:64] and bits
1562/// [255:192] in the destination. For example, if bits [3:0] of the immediate
1563/// operand contain a value of 0xF, the 256-bit destination vector would
1564/// contain the following values, from high to low: b[3], a[3], b[1], a[1].
1565///
1566/// \headerfile <x86intrin.h>
1567///
1568/// \code
1569/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
1570/// \endcode
1571///
1572/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
1573///
1574/// \param a
1575/// A 256-bit vector of [4 x double].
1576/// \param b
1577/// A 256-bit vector of [4 x double].
1578/// \param mask
1579/// An immediate value whose bits [3:0] each specify which element to
1580/// copy from \a a or \a b: \n
1581/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
1582/// destination. \n
1583/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
1584/// destination. \n
1585/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
1586/// destination. \n
1587/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
1588/// destination. \n
1589/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
1590/// destination. \n
1591/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
1592/// destination. \n
1593/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
1594/// destination. \n
1595/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
1596/// destination.
1597/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
1598#define _mm256_shuffle_pd(a, b, mask) \
1599 ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
1600 (__v4df)(__m256d)(b), (int)(mask)))
1601
1602/* Compare: predicate encodings 0x08-0x1f for the immediate operand of the
 * _mm_cmp_* and _mm256_cmp_* intrinsics documented below. Encodings 0x00-0x07
 * are not defined in this part of the file. */
1603#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
1604#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
1605#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
1606#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
1607#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
1608#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
1609#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
1610#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
1611#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
1612#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
1613#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
1614#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
1615#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
1616#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
1617#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
1618#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
1619#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
1620#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
1621#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
1622#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
1623#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
1624#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
1625#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
1626#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
1627
1628/* The intrinsic below is defined in emmintrin.h and can be used for AVX. */
1629/// Compares each of the corresponding double-precision values of two
1630/// 128-bit vectors of [2 x double], using the operation specified by the
1631/// immediate integer operand.
1632///
1633/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1634/// If either value in a comparison is NaN, comparisons that are ordered
1635/// return false, and comparisons that are unordered return true.
1636///
1637/// \headerfile <x86intrin.h>
1638///
1639/// \code
1640/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1641/// \endcode
1642///
1643/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1644///
1645/// \param a
1646/// A 128-bit vector of [2 x double].
1647/// \param b
1648/// A 128-bit vector of [2 x double].
1649/// \param c
1650/// An immediate integer operand, with bits [4:0] specifying which comparison
1651/// operation to use: \n
1652/// 0x00: Equal (ordered, non-signaling) \n
1653/// 0x01: Less-than (ordered, signaling) \n
1654/// 0x02: Less-than-or-equal (ordered, signaling) \n
1655/// 0x03: Unordered (non-signaling) \n
1656/// 0x04: Not-equal (unordered, non-signaling) \n
1657/// 0x05: Not-less-than (unordered, signaling) \n
1658/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1659/// 0x07: Ordered (non-signaling) \n
1660/// 0x08: Equal (unordered, non-signaling) \n
1661/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1662/// 0x0A: Not-greater-than (unordered, signaling) \n
1663/// 0x0B: False (ordered, non-signaling) \n
1664/// 0x0C: Not-equal (ordered, non-signaling) \n
1665/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1666/// 0x0E: Greater-than (ordered, signaling) \n
1667/// 0x0F: True (unordered, non-signaling) \n
1668/// 0x10: Equal (ordered, signaling) \n
1669/// 0x11: Less-than (ordered, non-signaling) \n
1670/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1671/// 0x13: Unordered (signaling) \n
1672/// 0x14: Not-equal (unordered, signaling) \n
1673/// 0x15: Not-less-than (unordered, non-signaling) \n
1674/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1675/// 0x17: Ordered (signaling) \n
1676/// 0x18: Equal (unordered, signaling) \n
1677/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1678/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1679/// 0x1B: False (ordered, signaling) \n
1680/// 0x1C: Not-equal (ordered, signaling) \n
1681/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1682/// 0x1E: Greater-than (ordered, non-signaling) \n
1683/// 0x1F: True (unordered, signaling)
1684/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1685/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1686
1687/* The intrinsic below is defined in xmmintrin.h and can be used for AVX. */
1688/// Compares each of the corresponding values of two 128-bit vectors of
1689/// [4 x float], using the operation specified by the immediate integer
1690/// operand.
1691///
1692/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1693/// If either value in a comparison is NaN, comparisons that are ordered
1694/// return false, and comparisons that are unordered return true.
1695///
1696/// \headerfile <x86intrin.h>
1697///
1698/// \code
1699/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1700/// \endcode
1701///
1702/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1703///
1704/// \param a
1705/// A 128-bit vector of [4 x float].
1706/// \param b
1707/// A 128-bit vector of [4 x float].
1708/// \param c
1709/// An immediate integer operand, with bits [4:0] specifying which comparison
1710/// operation to use: \n
1711/// 0x00: Equal (ordered, non-signaling) \n
1712/// 0x01: Less-than (ordered, signaling) \n
1713/// 0x02: Less-than-or-equal (ordered, signaling) \n
1714/// 0x03: Unordered (non-signaling) \n
1715/// 0x04: Not-equal (unordered, non-signaling) \n
1716/// 0x05: Not-less-than (unordered, signaling) \n
1717/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1718/// 0x07: Ordered (non-signaling) \n
1719/// 0x08: Equal (unordered, non-signaling) \n
1720/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1721/// 0x0A: Not-greater-than (unordered, signaling) \n
1722/// 0x0B: False (ordered, non-signaling) \n
1723/// 0x0C: Not-equal (ordered, non-signaling) \n
1724/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1725/// 0x0E: Greater-than (ordered, signaling) \n
1726/// 0x0F: True (unordered, non-signaling) \n
1727/// 0x10: Equal (ordered, signaling) \n
1728/// 0x11: Less-than (ordered, non-signaling) \n
1729/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1730/// 0x13: Unordered (signaling) \n
1731/// 0x14: Not-equal (unordered, signaling) \n
1732/// 0x15: Not-less-than (unordered, non-signaling) \n
1733/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1734/// 0x17: Ordered (signaling) \n
1735/// 0x18: Equal (unordered, signaling) \n
1736/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1737/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1738/// 0x1B: False (ordered, signaling) \n
1739/// 0x1C: Not-equal (ordered, signaling) \n
1740/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1741/// 0x1E: Greater-than (ordered, non-signaling) \n
1742/// 0x1F: True (unordered, signaling)
1743/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1744/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1745
1746/// Compares each of the corresponding double-precision values of two
1747/// 256-bit vectors of [4 x double], using the operation specified by the
1748/// immediate integer operand.
1749///
1750/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1751/// If either value in a comparison is NaN, comparisons that are ordered
1752/// return false, and comparisons that are unordered return true.
1753///
1754/// \headerfile <x86intrin.h>
1755///
1756/// \code
1757/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
1758/// \endcode
1759///
1760/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1761///
1762/// \param a
1763/// A 256-bit vector of [4 x double].
1764/// \param b
1765/// A 256-bit vector of [4 x double].
1766/// \param c
1767/// An immediate integer operand, with bits [4:0] specifying which comparison
1768/// operation to use. Values 0x08 through 0x1F are also available as the
/// \c _CMP_* constants defined earlier in this file: \n
1769/// 0x00: Equal (ordered, non-signaling) \n
1770/// 0x01: Less-than (ordered, signaling) \n
1771/// 0x02: Less-than-or-equal (ordered, signaling) \n
1772/// 0x03: Unordered (non-signaling) \n
1773/// 0x04: Not-equal (unordered, non-signaling) \n
1774/// 0x05: Not-less-than (unordered, signaling) \n
1775/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1776/// 0x07: Ordered (non-signaling) \n
1777/// 0x08: Equal (unordered, non-signaling) \n
1778/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1779/// 0x0A: Not-greater-than (unordered, signaling) \n
1780/// 0x0B: False (ordered, non-signaling) \n
1781/// 0x0C: Not-equal (ordered, non-signaling) \n
1782/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1783/// 0x0E: Greater-than (ordered, signaling) \n
1784/// 0x0F: True (unordered, non-signaling) \n
1785/// 0x10: Equal (ordered, signaling) \n
1786/// 0x11: Less-than (ordered, non-signaling) \n
1787/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1788/// 0x13: Unordered (signaling) \n
1789/// 0x14: Not-equal (unordered, signaling) \n
1790/// 0x15: Not-less-than (unordered, non-signaling) \n
1791/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1792/// 0x17: Ordered (signaling) \n
1793/// 0x18: Equal (unordered, signaling) \n
1794/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1795/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1796/// 0x1B: False (ordered, signaling) \n
1797/// 0x1C: Not-equal (ordered, signaling) \n
1798/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1799/// 0x1E: Greater-than (ordered, non-signaling) \n
1800/// 0x1F: True (unordered, signaling)
1801/// \returns A 256-bit vector of [4 x double] containing the comparison results.
1802#define _mm256_cmp_pd(a, b, c) \
1803 ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
1804 (__v4df)(__m256d)(b), (c)))
1805
1806/// Compares each of the corresponding values of two 256-bit vectors of
1807/// [8 x float], using the operation specified by the immediate integer
1808/// operand.
1809///
1810/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1811/// If either value in a comparison is NaN, comparisons that are ordered
1812/// return false, and comparisons that are unordered return true.
1813///
1814/// \headerfile <x86intrin.h>
1815///
1816/// \code
1817/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
1818/// \endcode
1819///
1820/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1821///
1822/// \param a
1823/// A 256-bit vector of [8 x float].
1824/// \param b
1825/// A 256-bit vector of [8 x float].
1826/// \param c
1827/// An immediate integer operand, with bits [4:0] specifying which comparison
1828/// operation to use. Values 0x08 through 0x1F are also available as the
/// \c _CMP_* constants defined earlier in this file: \n
1829/// 0x00: Equal (ordered, non-signaling) \n
1830/// 0x01: Less-than (ordered, signaling) \n
1831/// 0x02: Less-than-or-equal (ordered, signaling) \n
1832/// 0x03: Unordered (non-signaling) \n
1833/// 0x04: Not-equal (unordered, non-signaling) \n
1834/// 0x05: Not-less-than (unordered, signaling) \n
1835/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1836/// 0x07: Ordered (non-signaling) \n
1837/// 0x08: Equal (unordered, non-signaling) \n
1838/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1839/// 0x0A: Not-greater-than (unordered, signaling) \n
1840/// 0x0B: False (ordered, non-signaling) \n
1841/// 0x0C: Not-equal (ordered, non-signaling) \n
1842/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1843/// 0x0E: Greater-than (ordered, signaling) \n
1844/// 0x0F: True (unordered, non-signaling) \n
1845/// 0x10: Equal (ordered, signaling) \n
1846/// 0x11: Less-than (ordered, non-signaling) \n
1847/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1848/// 0x13: Unordered (signaling) \n
1849/// 0x14: Not-equal (unordered, signaling) \n
1850/// 0x15: Not-less-than (unordered, non-signaling) \n
1851/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1852/// 0x17: Ordered (signaling) \n
1853/// 0x18: Equal (unordered, signaling) \n
1854/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1855/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1856/// 0x1B: False (ordered, signaling) \n
1857/// 0x1C: Not-equal (ordered, signaling) \n
1858/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1859/// 0x1E: Greater-than (ordered, non-signaling) \n
1860/// 0x1F: True (unordered, signaling)
1861/// \returns A 256-bit vector of [8 x float] containing the comparison results.
1862#define _mm256_cmp_ps(a, b, c) \
1863 ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
1864 (__v8sf)(__m256)(b), (c)))
1865
1866/* The intrinsic below is defined in emmintrin.h and can be used for AVX. */
1867/// Compares each of the corresponding scalar double-precision values of
1868/// two 128-bit vectors of [2 x double], using the operation specified by the
1869/// immediate integer operand.
1870///
1871/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1872/// If either value in a comparison is NaN, comparisons that are ordered
1873/// return false, and comparisons that are unordered return true.
1874///
1875/// \headerfile <x86intrin.h>
1876///
1877/// \code
1878/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1879/// \endcode
1880///
1881/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1882///
1883/// \param a
1884/// A 128-bit vector of [2 x double].
1885/// \param b
1886/// A 128-bit vector of [2 x double].
1887/// \param c
1888/// An immediate integer operand, with bits [4:0] specifying which comparison
1889/// operation to use: \n
1890/// 0x00: Equal (ordered, non-signaling) \n
1891/// 0x01: Less-than (ordered, signaling) \n
1892/// 0x02: Less-than-or-equal (ordered, signaling) \n
1893/// 0x03: Unordered (non-signaling) \n
1894/// 0x04: Not-equal (unordered, non-signaling) \n
1895/// 0x05: Not-less-than (unordered, signaling) \n
1896/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1897/// 0x07: Ordered (non-signaling) \n
1898/// 0x08: Equal (unordered, non-signaling) \n
1899/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1900/// 0x0A: Not-greater-than (unordered, signaling) \n
1901/// 0x0B: False (ordered, non-signaling) \n
1902/// 0x0C: Not-equal (ordered, non-signaling) \n
1903/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1904/// 0x0E: Greater-than (ordered, signaling) \n
1905/// 0x0F: True (unordered, non-signaling) \n
1906/// 0x10: Equal (ordered, signaling) \n
1907/// 0x11: Less-than (ordered, non-signaling) \n
1908/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1909/// 0x13: Unordered (signaling) \n
1910/// 0x14: Not-equal (unordered, signaling) \n
1911/// 0x15: Not-less-than (unordered, non-signaling) \n
1912/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1913/// 0x17: Ordered (signaling) \n
1914/// 0x18: Equal (unordered, signaling) \n
1915/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1916/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1917/// 0x1B: False (ordered, signaling) \n
1918/// 0x1C: Not-equal (ordered, signaling) \n
1919/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1920/// 0x1E: Greater-than (ordered, non-signaling) \n
1921/// 0x1F: True (unordered, signaling)
1922/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1923/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1924
1925/* The intrinsic below is defined in xmmintrin.h and can be used for AVX. */
1926/// Compares each of the corresponding scalar values of two 128-bit
1927/// vectors of [4 x float], using the operation specified by the immediate
1928/// integer operand.
1929///
1930/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1931/// If either value in a comparison is NaN, comparisons that are ordered
1932/// return false, and comparisons that are unordered return true.
1933///
1934/// \headerfile <x86intrin.h>
1935///
1936/// \code
1937/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1938/// \endcode
1939///
1940/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1941///
1942/// \param a
1943/// A 128-bit vector of [4 x float].
1944/// \param b
1945/// A 128-bit vector of [4 x float].
1946/// \param c
1947/// An immediate integer operand, with bits [4:0] specifying which comparison
1948/// operation to use: \n
1949/// 0x00: Equal (ordered, non-signaling) \n
1950/// 0x01: Less-than (ordered, signaling) \n
1951/// 0x02: Less-than-or-equal (ordered, signaling) \n
1952/// 0x03: Unordered (non-signaling) \n
1953/// 0x04: Not-equal (unordered, non-signaling) \n
1954/// 0x05: Not-less-than (unordered, signaling) \n
1955/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1956/// 0x07: Ordered (non-signaling) \n
1957/// 0x08: Equal (unordered, non-signaling) \n
1958/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1959/// 0x0A: Not-greater-than (unordered, signaling) \n
1960/// 0x0B: False (ordered, non-signaling) \n
1961/// 0x0C: Not-equal (ordered, non-signaling) \n
1962/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1963/// 0x0E: Greater-than (ordered, signaling) \n
1964/// 0x0F: True (unordered, non-signaling) \n
1965/// 0x10: Equal (ordered, signaling) \n
1966/// 0x11: Less-than (ordered, non-signaling) \n
1967/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1968/// 0x13: Unordered (signaling) \n
1969/// 0x14: Not-equal (unordered, signaling) \n
1970/// 0x15: Not-less-than (unordered, non-signaling) \n
1971/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1972/// 0x17: Ordered (signaling) \n
1973/// 0x18: Equal (unordered, signaling) \n
1974/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1975/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1976/// 0x1B: False (ordered, signaling) \n
1977/// 0x1C: Not-equal (ordered, signaling) \n
1978/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1979/// 0x1E: Greater-than (ordered, non-signaling) \n
1980/// 0x1F: True (unordered, signaling)
1981/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1982/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1983
1984/// Takes a 256-bit integer vector of [8 x i32] and returns the vector
1985/// element value indexed by the immediate constant operand.
1986///
1987/// \headerfile <x86intrin.h>
1988///
1989/// \code
1990/// int _mm256_extract_epi32(__m256i X, const int N);
1991/// \endcode
1992///
1993/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1994/// instruction.
1995///
1996/// \param X
1997/// A 256-bit integer vector of [8 x i32].
1998/// \param N
1999/// An immediate integer operand with bits [2:0] determining which vector
2000/// element is extracted and returned.
2001/// \returns A 32-bit integer containing the extracted 32-bit vector
2002/// element.
2003#define _mm256_extract_epi32(X, N) \
2004 ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
2005
2006/// Takes a 256-bit integer vector of [16 x i16] and returns the vector
2007/// element value indexed by the immediate constant operand.
2008///
2009/// \headerfile <x86intrin.h>
2010///
2011/// \code
2012/// int _mm256_extract_epi16(__m256i X, const int N);
2013/// \endcode
2014///
2015/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2016/// instruction.
2017///
2018/// \param X
2019/// A 256-bit integer vector of [16 x i16].
2020/// \param N
2021/// An immediate integer operand with bits [3:0] determining which vector
2022/// element is extracted and returned.
2023/// \returns A 32-bit integer containing the extracted 16-bit vector element,
2024/// zero-extended (the element is converted through unsigned short).
2025#define _mm256_extract_epi16(X, N) \
2026 ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
2027 (int)(N)))
2028
2029/// Takes a 256-bit integer vector of [32 x i8] and returns the vector
2030/// element value indexed by the immediate constant operand.
2031///
2032/// \headerfile <x86intrin.h>
2033///
2034/// \code
2035/// int _mm256_extract_epi8(__m256i X, const int N);
2036/// \endcode
2037///
2038/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2039/// instruction.
2040///
2041/// \param X
2042/// A 256-bit integer vector of [32 x i8].
2043/// \param N
2044/// An immediate integer operand with bits [4:0] determining which vector
2045/// element is extracted and returned.
2046/// \returns A 32-bit integer containing the extracted 8-bit vector element,
2047/// zero-extended (the element is converted through unsigned char).
2048#define _mm256_extract_epi8(X, N) \
2049 ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
2050 (int)(N)))
2051
2052#ifdef __x86_64__
2053/// Takes a 256-bit integer vector of [4 x i64] and returns the vector
2054/// element value indexed by the immediate constant operand.
2055///
2056/// \headerfile <x86intrin.h>
2057///
2058/// \code
2059/// long long _mm256_extract_epi64(__m256i X, const int N);
2060/// \endcode
2061///
2062/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2063/// instruction. It is only defined when \c __x86_64__ is defined.
2064///
2065/// \param X
2066/// A 256-bit integer vector of [4 x i64].
2067/// \param N
2068/// An immediate integer operand with bits [1:0] determining which vector
2069/// element is extracted and returned.
2070/// \returns A 64-bit integer containing the extracted 64-bit vector
2071/// element.
2072#define _mm256_extract_epi64(X, N) \
2073 ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
2074#endif
2075
2076/// Takes a 256-bit integer vector of [8 x i32] and replaces the vector
2077/// element value indexed by the immediate constant operand with a new value.
2078/// Returns the modified vector.
2079///
2080/// \headerfile <x86intrin.h>
2081///
2082/// \code
2083/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
2084/// \endcode
2085///
2086/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2087/// instruction.
2088///
2089/// \param X
2090/// A 256-bit integer vector of [8 x i32] to be used by the insert operation.
2091/// \param I
2092/// An integer value. The replacement value for the insert operation.
2093/// \param N
2094/// An immediate integer specifying the index of the vector element to be
2095/// replaced.
2096/// \returns A copy of vector \a X, after replacing its element indexed by
2097/// \a N with \a I.
2098#define _mm256_insert_epi32(X, I, N) \
2099 ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
2100 (int)(I), (int)(N)))
2101
2102
2103/// Takes a 256-bit integer vector of [16 x i16] and replaces the vector
2104/// element value indexed by the immediate constant operand with a new value.
2105/// Returns the modified vector.
2106///
2107/// \headerfile <x86intrin.h>
2108///
2109/// \code
2110/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
2111/// \endcode
2112///
2113/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2114/// instruction.
2115///
2116/// \param X
2117/// A 256-bit integer vector of [16 x i16] to be used by the insert operation.
2118/// \param I
2119/// An i16 integer value. The replacement value for the insert operation.
2120/// \param N
2121/// An immediate integer specifying the index of the vector element to be
2122/// replaced.
2123/// \returns A copy of vector \a X, after replacing its element indexed by
2124/// \a N with \a I.
2125#define _mm256_insert_epi16(X, I, N) \
2126 ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
2127 (int)(I), (int)(N)))
2128
2129/// Takes a 256-bit integer vector of [32 x i8] and replaces the vector
2130/// element value indexed by the immediate constant operand with a new value.
2131/// Returns the modified vector.
2132///
2133/// \headerfile <x86intrin.h>
2134///
2135/// \code
2136/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
2137/// \endcode
2138///
2139/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2140/// instruction.
2141///
2142/// \param X
2143/// A 256-bit integer vector of [32 x i8] to be used by the insert operation.
2144/// \param I
2145/// An i8 integer value. The replacement value for the insert operation.
2146/// \param N
2147/// An immediate integer specifying the index of the vector element to be
2148/// replaced.
2149/// \returns A copy of vector \a X, after replacing its element indexed by
2150/// \a N with \a I.
2151#define _mm256_insert_epi8(X, I, N) \
2152 ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
2153 (int)(I), (int)(N)))
2154
2155#ifdef __x86_64__
2156/// Takes a 256-bit integer vector of [4 x i64] and replaces the vector
2157/// element value indexed by the immediate constant operand with a new value.
2158/// Returns the modified vector.
2159///
2160/// \headerfile <x86intrin.h>
2161///
2162/// \code
2163/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
2164/// \endcode
2165///
2166/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2167/// instruction. It is only defined when \c __x86_64__ is defined.
2168///
2169/// \param X
2170/// A 256-bit integer vector of [4 x i64] to be used by the insert operation.
2171/// \param I
2172/// A 64-bit integer value. The replacement value for the insert operation.
2173/// \param N
2174/// An immediate integer specifying the index of the vector element to be
2175/// replaced.
2176/// \returns A copy of vector \a X, after replacing its element indexed by
2177/// \a N with \a I.
2178#define _mm256_insert_epi64(X, I, N) \
2179 ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
2180 (long long)(I), (int)(N)))
2181#endif
2182
2183/* Conversion */
2184/// Converts a 128-bit vector of [4 x i32] into a 256-bit vector of
/// [4 x double].
2185///
2186/// \headerfile <x86intrin.h>
2187///
2188/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2189///
2190/// \param __a
2191/// A 128-bit integer vector of [4 x i32].
2192/// \returns A 256-bit vector of [4 x double] containing the converted values.
2193static __inline __m256d __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvtepi32_pd(__m128i __a)` - confirm against the upstream header. */
2195{
2196 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2197}
2198
2199/// Converts a 256-bit vector of [8 x i32] into a 256-bit vector of
/// [8 x float].
2200///
2201/// \headerfile <x86intrin.h>
2202///
2203/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2204///
2205/// \param __a
2206/// A 256-bit integer vector of [8 x i32].
2207/// \returns A 256-bit vector of [8 x float] containing the converted values.
2208static __inline __m256 __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvtepi32_ps(__m256i __a)` - confirm against the upstream header. */
2210{
2211 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2212}
2213
2214/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2215/// [4 x float].
2216///
2217/// \headerfile <x86intrin.h>
2218///
2219/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2220///
2221/// \param __a
2222/// A 256-bit vector of [4 x double].
2223/// \returns A 128-bit vector of [4 x float] containing the converted values.
2224static __inline __m128 __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvtpd_ps(__m256d __a)` - confirm against the upstream header. */
2226{
2227 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2228}
2229
2230/// Converts a 256-bit vector of [8 x float] into a 256-bit vector of
/// [8 x i32].
2231///
2232/// If a converted value does not fit in a 32-bit integer, raises a
2233/// floating-point invalid exception. If the exception is masked, returns
2234/// the most negative integer.
2235///
2236/// \headerfile <x86intrin.h>
2237///
2238/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2239///
2240/// \param __a
2241/// A 256-bit vector of [8 x float].
2242/// \returns A 256-bit integer vector of [8 x i32] containing the converted
/// values.
2243static __inline __m256i __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvtps_epi32(__m256 __a)` - confirm against the upstream header. */
2245{
2246 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2247}
2248
2249/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of
2250/// [4 x double].
2251///
2252/// \headerfile <x86intrin.h>
2253///
2254/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2255///
2256/// \param __a
2257/// A 128-bit vector of [4 x float].
2258/// \returns A 256-bit vector of [4 x double] containing the converted values.
2259static __inline __m256d __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvtps_pd(__m128 __a)` - confirm against the upstream header. */
2261{
2262 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2263}
2264
2265/// Converts a 256-bit vector of [4 x double] into four signed truncated
2266/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2267/// [4 x i32].
2268///
2269/// If a converted value does not fit in a 32-bit integer, raises a
2270/// floating-point invalid exception. If the exception is masked, returns
2271/// the most negative integer.
2272///
2273/// \headerfile <x86intrin.h>
2274///
2275/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2276///
2277/// \param __a
2278/// A 256-bit vector of [4 x double].
2279/// \returns A 128-bit integer vector of [4 x i32] containing the converted
/// values.
2280static __inline __m128i __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvttpd_epi32(__m256d __a)` - confirm against the upstream header. */
2282{
2283 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2284}
2285
2286/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2287/// [4 x i32].
2288///
2289/// If a converted value does not fit in a 32-bit integer, raises a
2290/// floating-point invalid exception. If the exception is masked, returns
2291/// the most negative integer.
2292///
2293/// \headerfile <x86intrin.h>
2294///
2295/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2296///
2297/// \param __a
2298/// A 256-bit vector of [4 x double].
2299/// \returns A 128-bit integer vector of [4 x i32] containing the converted
/// values.
2300static __inline __m128i __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvtpd_epi32(__m256d __a)` - confirm against the upstream header. */
2302{
2303 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2304}
2305
2306/// Converts a vector of [8 x float] into eight signed truncated (rounded
2307/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2308///
2309/// If a converted value does not fit in a 32-bit integer, raises a
2310/// floating-point invalid exception. If the exception is masked, returns
2311/// the most negative integer.
2312///
2313/// \headerfile <x86intrin.h>
2314///
2315/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2316///
2317/// \param __a
2318/// A 256-bit vector of [8 x float].
2319/// \returns A 256-bit integer vector of [8 x i32] containing the converted
/// values.
2320static __inline __m256i __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvttps_epi32(__m256 __a)` - confirm against the upstream header. */
2322{
2323 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2324}
2325
2326/// Returns the first element of the input vector of [4 x double].
2327///
2328/// \headerfile <x86intrin.h>
2329///
2330/// This intrinsic is a utility function and does not correspond to a specific
2331/// instruction.
2332///
2333/// \param __a
2334/// A 256-bit vector of [4 x double].
2335/// \returns A 64-bit double containing the first element of the input vector.
2336static __inline double __DEFAULT_FN_ATTRS
/* NOTE(review): the declarator line (function name and parameter list) is
 * missing from this listing between the return type and the body; presumably
 * `_mm256_cvtsd_f64(__m256d __a)` - confirm against the upstream header. */
2338{
2339 return __a[0];
2340}
2341
2342/// Returns the first element of the input vector of [8 x i32].
2343///
2344/// \headerfile <x86intrin.h>
2345///
2346/// This intrinsic is a utility function and does not correspond to a specific
2347/// instruction.
2348///
2349/// \param __a
2350/// A 256-bit vector of [8 x i32].
2351/// \returns A 32 bit integer containing the first element of the input vector.
2352static __inline int __DEFAULT_FN_ATTRS
2354{
2355 __v8si __b = (__v8si)__a;
2356 return __b[0];
2357}
2358
2359/// Returns the first element of the input vector of [8 x float].
2360///
2361/// \headerfile <x86intrin.h>
2362///
2363/// This intrinsic is a utility function and does not correspond to a specific
2364/// instruction.
2365///
2366/// \param __a
2367/// A 256-bit vector of [8 x float].
2368/// \returns A 32 bit float containing the first element of the input vector.
2369static __inline float __DEFAULT_FN_ATTRS
2371{
2372 return __a[0];
2373}
2374
2375/* Vector replicate */
2376/// Moves and duplicates odd-indexed values from a 256-bit vector of
2377/// [8 x float] to float values in a 256-bit vector of [8 x float].
2378///
2379/// \headerfile <x86intrin.h>
2380///
2381/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2382///
2383/// \param __a
2384/// A 256-bit vector of [8 x float]. \n
2385/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2386/// the return value. \n
2387/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2388/// the return value. \n
2389/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2390/// return value. \n
2391/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2392/// return value.
2393/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2394/// values.
2395static __inline __m256 __DEFAULT_FN_ATTRS
2397{
2398 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2399}
2400
2401/// Moves and duplicates even-indexed values from a 256-bit vector of
2402/// [8 x float] to float values in a 256-bit vector of [8 x float].
2403///
2404/// \headerfile <x86intrin.h>
2405///
2406/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2407///
2408/// \param __a
2409/// A 256-bit vector of [8 x float]. \n
2410/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2411/// the return value. \n
2412/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2413/// the return value. \n
2414/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2415/// return value. \n
2416/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2417/// return value.
2418/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2419/// values.
2420static __inline __m256 __DEFAULT_FN_ATTRS
2422{
2423 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2424}
2425
2426/// Moves and duplicates double-precision floating point values from a
2427/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2428/// vector of [4 x double].
2429///
2430/// \headerfile <x86intrin.h>
2431///
2432/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2433///
2434/// \param __a
2435/// A 256-bit vector of [4 x double]. \n
2436/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2437/// return value. \n
2438/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2439/// the return value.
2440/// \returns A 256-bit vector of [4 x double] containing the moved and
2441/// duplicated values.
2442static __inline __m256d __DEFAULT_FN_ATTRS
2444{
2445 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2446}
2447
2448/* Unpack and Interleave */
2449/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2450/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2451///
2452/// \headerfile <x86intrin.h>
2453///
2454/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2455///
2456/// \param __a
2457/// A 256-bit floating-point vector of [4 x double]. \n
2458/// Bits [127:64] are written to bits [63:0] of the return value. \n
2459/// Bits [255:192] are written to bits [191:128] of the return value. \n
2460/// \param __b
2461/// A 256-bit floating-point vector of [4 x double]. \n
2462/// Bits [127:64] are written to bits [127:64] of the return value. \n
2463/// Bits [255:192] are written to bits [255:192] of the return value. \n
2464/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2465static __inline __m256d __DEFAULT_FN_ATTRS
2466_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2467{
2468 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2469}
2470
2471/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2472/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2473///
2474/// \headerfile <x86intrin.h>
2475///
2476/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2477///
2478/// \param __a
2479/// A 256-bit floating-point vector of [4 x double]. \n
2480/// Bits [63:0] are written to bits [63:0] of the return value. \n
2481/// Bits [191:128] are written to bits [191:128] of the return value.
2482/// \param __b
2483/// A 256-bit floating-point vector of [4 x double]. \n
2484/// Bits [63:0] are written to bits [127:64] of the return value. \n
2485/// Bits [191:128] are written to bits [255:192] of the return value. \n
2486/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2487static __inline __m256d __DEFAULT_FN_ATTRS
2488_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2489{
2490 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2491}
2492
2493/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2494/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2495/// vector of [8 x float].
2496///
2497/// \headerfile <x86intrin.h>
2498///
2499/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2500///
2501/// \param __a
2502/// A 256-bit vector of [8 x float]. \n
2503/// Bits [95:64] are written to bits [31:0] of the return value. \n
2504/// Bits [127:96] are written to bits [95:64] of the return value. \n
2505/// Bits [223:192] are written to bits [159:128] of the return value. \n
2506/// Bits [255:224] are written to bits [223:192] of the return value.
2507/// \param __b
2508/// A 256-bit vector of [8 x float]. \n
2509/// Bits [95:64] are written to bits [63:32] of the return value. \n
2510/// Bits [127:96] are written to bits [127:96] of the return value. \n
2511/// Bits [223:192] are written to bits [191:160] of the return value. \n
2512/// Bits [255:224] are written to bits [255:224] of the return value.
2513/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2514static __inline __m256 __DEFAULT_FN_ATTRS
2516{
2517 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2518}
2519
2520/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2521/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2522/// vector of [8 x float].
2523///
2524/// \headerfile <x86intrin.h>
2525///
2526/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2527///
2528/// \param __a
2529/// A 256-bit vector of [8 x float]. \n
2530/// Bits [31:0] are written to bits [31:0] of the return value. \n
2531/// Bits [63:32] are written to bits [95:64] of the return value. \n
2532/// Bits [159:128] are written to bits [159:128] of the return value. \n
2533/// Bits [191:160] are written to bits [223:192] of the return value.
2534/// \param __b
2535/// A 256-bit vector of [8 x float]. \n
2536/// Bits [31:0] are written to bits [63:32] of the return value. \n
2537/// Bits [63:32] are written to bits [127:96] of the return value. \n
2538/// Bits [159:128] are written to bits [191:160] of the return value. \n
2539/// Bits [191:160] are written to bits [255:224] of the return value.
2540/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2541static __inline __m256 __DEFAULT_FN_ATTRS
2543{
2544 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2545}
2546
2547/* Bit Test */
2548/// Given two 128-bit floating-point vectors of [2 x double], perform an
2549/// element-by-element comparison of the double-precision element in the
2550/// first source vector and the corresponding element in the second source
2551/// vector.
2552///
2553/// The EFLAGS register is updated as follows: \n
2554/// If there is at least one pair of double-precision elements where the
2555/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2556/// ZF flag is set to 1. \n
2557/// If there is at least one pair of double-precision elements where the
2558/// sign-bit of the first element is 0 and the sign-bit of the second element
2559/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2560/// This intrinsic returns the value of the ZF flag.
2561///
2562/// \headerfile <x86intrin.h>
2563///
2564/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2565///
2566/// \param __a
2567/// A 128-bit vector of [2 x double].
2568/// \param __b
2569/// A 128-bit vector of [2 x double].
2570/// \returns the ZF flag in the EFLAGS register.
2571static __inline int __DEFAULT_FN_ATTRS128
2572_mm_testz_pd(__m128d __a, __m128d __b)
2573{
2574 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2575}
2576
2577/// Given two 128-bit floating-point vectors of [2 x double], perform an
2578/// element-by-element comparison of the double-precision element in the
2579/// first source vector and the corresponding element in the second source
2580/// vector.
2581///
2582/// The EFLAGS register is updated as follows: \n
2583/// If there is at least one pair of double-precision elements where the
2584/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2585/// ZF flag is set to 1. \n
2586/// If there is at least one pair of double-precision elements where the
2587/// sign-bit of the first element is 0 and the sign-bit of the second element
2588/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2589/// This intrinsic returns the value of the CF flag.
2590///
2591/// \headerfile <x86intrin.h>
2592///
2593/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2594///
2595/// \param __a
2596/// A 128-bit vector of [2 x double].
2597/// \param __b
2598/// A 128-bit vector of [2 x double].
2599/// \returns the CF flag in the EFLAGS register.
2600static __inline int __DEFAULT_FN_ATTRS128
2601_mm_testc_pd(__m128d __a, __m128d __b)
2602{
2603 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2604}
2605
2606/// Given two 128-bit floating-point vectors of [2 x double], perform an
2607/// element-by-element comparison of the double-precision element in the
2608/// first source vector and the corresponding element in the second source
2609/// vector.
2610///
2611/// The EFLAGS register is updated as follows: \n
2612/// If there is at least one pair of double-precision elements where the
2613/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2614/// ZF flag is set to 1. \n
2615/// If there is at least one pair of double-precision elements where the
2616/// sign-bit of the first element is 0 and the sign-bit of the second element
2617/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2618/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2619/// otherwise it returns 0.
2620///
2621/// \headerfile <x86intrin.h>
2622///
2623/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2624///
2625/// \param __a
2626/// A 128-bit vector of [2 x double].
2627/// \param __b
2628/// A 128-bit vector of [2 x double].
2629/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2630static __inline int __DEFAULT_FN_ATTRS128
2631_mm_testnzc_pd(__m128d __a, __m128d __b)
2632{
2633 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2634}
2635
2636/// Given two 128-bit floating-point vectors of [4 x float], perform an
2637/// element-by-element comparison of the single-precision element in the
2638/// first source vector and the corresponding element in the second source
2639/// vector.
2640///
2641/// The EFLAGS register is updated as follows: \n
2642/// If there is at least one pair of single-precision elements where the
2643/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2644/// ZF flag is set to 1. \n
2645/// If there is at least one pair of single-precision elements where the
2646/// sign-bit of the first element is 0 and the sign-bit of the second element
2647/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2648/// This intrinsic returns the value of the ZF flag.
2649///
2650/// \headerfile <x86intrin.h>
2651///
2652/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2653///
2654/// \param __a
2655/// A 128-bit vector of [4 x float].
2656/// \param __b
2657/// A 128-bit vector of [4 x float].
2658/// \returns the ZF flag.
2659static __inline int __DEFAULT_FN_ATTRS128
2660_mm_testz_ps(__m128 __a, __m128 __b)
2661{
2662 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2663}
2664
2665/// Given two 128-bit floating-point vectors of [4 x float], perform an
2666/// element-by-element comparison of the single-precision element in the
2667/// first source vector and the corresponding element in the second source
2668/// vector.
2669///
2670/// The EFLAGS register is updated as follows: \n
2671/// If there is at least one pair of single-precision elements where the
2672/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2673/// ZF flag is set to 1. \n
2674/// If there is at least one pair of single-precision elements where the
2675/// sign-bit of the first element is 0 and the sign-bit of the second element
2676/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2677/// This intrinsic returns the value of the CF flag.
2678///
2679/// \headerfile <x86intrin.h>
2680///
2681/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2682///
2683/// \param __a
2684/// A 128-bit vector of [4 x float].
2685/// \param __b
2686/// A 128-bit vector of [4 x float].
2687/// \returns the CF flag.
2688static __inline int __DEFAULT_FN_ATTRS128
2689_mm_testc_ps(__m128 __a, __m128 __b)
2690{
2691 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2692}
2693
2694/// Given two 128-bit floating-point vectors of [4 x float], perform an
2695/// element-by-element comparison of the single-precision element in the
2696/// first source vector and the corresponding element in the second source
2697/// vector.
2698///
2699/// The EFLAGS register is updated as follows: \n
2700/// If there is at least one pair of single-precision elements where the
2701/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2702/// ZF flag is set to 1. \n
2703/// If there is at least one pair of single-precision elements where the
2704/// sign-bit of the first element is 0 and the sign-bit of the second element
2705/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2706/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2707/// otherwise it returns 0.
2708///
2709/// \headerfile <x86intrin.h>
2710///
2711/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2712///
2713/// \param __a
2714/// A 128-bit vector of [4 x float].
2715/// \param __b
2716/// A 128-bit vector of [4 x float].
2717/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2718static __inline int __DEFAULT_FN_ATTRS128
2719_mm_testnzc_ps(__m128 __a, __m128 __b)
2720{
2721 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2722}
2723
2724/// Given two 256-bit floating-point vectors of [4 x double], perform an
2725/// element-by-element comparison of the double-precision elements in the
2726/// first source vector and the corresponding elements in the second source
2727/// vector.
2728///
2729/// The EFLAGS register is updated as follows: \n
2730/// If there is at least one pair of double-precision elements where the
2731/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2732/// ZF flag is set to 1. \n
2733/// If there is at least one pair of double-precision elements where the
2734/// sign-bit of the first element is 0 and the sign-bit of the second element
2735/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2736/// This intrinsic returns the value of the ZF flag.
2737///
2738/// \headerfile <x86intrin.h>
2739///
2740/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2741///
2742/// \param __a
2743/// A 256-bit vector of [4 x double].
2744/// \param __b
2745/// A 256-bit vector of [4 x double].
2746/// \returns the ZF flag.
2747static __inline int __DEFAULT_FN_ATTRS
2748_mm256_testz_pd(__m256d __a, __m256d __b)
2749{
2750 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2751}
2752
2753/// Given two 256-bit floating-point vectors of [4 x double], perform an
2754/// element-by-element comparison of the double-precision elements in the
2755/// first source vector and the corresponding elements in the second source
2756/// vector.
2757///
2758/// The EFLAGS register is updated as follows: \n
2759/// If there is at least one pair of double-precision elements where the
2760/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2761/// ZF flag is set to 1. \n
2762/// If there is at least one pair of double-precision elements where the
2763/// sign-bit of the first element is 0 and the sign-bit of the second element
2764/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2765/// This intrinsic returns the value of the CF flag.
2766///
2767/// \headerfile <x86intrin.h>
2768///
2769/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2770///
2771/// \param __a
2772/// A 256-bit vector of [4 x double].
2773/// \param __b
2774/// A 256-bit vector of [4 x double].
2775/// \returns the CF flag.
2776static __inline int __DEFAULT_FN_ATTRS
2777_mm256_testc_pd(__m256d __a, __m256d __b)
2778{
2779 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2780}
2781
2782/// Given two 256-bit floating-point vectors of [4 x double], perform an
2783/// element-by-element comparison of the double-precision elements in the
2784/// first source vector and the corresponding elements in the second source
2785/// vector.
2786///
2787/// The EFLAGS register is updated as follows: \n
2788/// If there is at least one pair of double-precision elements where the
2789/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2790/// ZF flag is set to 1. \n
2791/// If there is at least one pair of double-precision elements where the
2792/// sign-bit of the first element is 0 and the sign-bit of the second element
2793/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2794/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2795/// otherwise it returns 0.
2796///
2797/// \headerfile <x86intrin.h>
2798///
2799/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2800///
2801/// \param __a
2802/// A 256-bit vector of [4 x double].
2803/// \param __b
2804/// A 256-bit vector of [4 x double].
2805/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2806static __inline int __DEFAULT_FN_ATTRS
2807_mm256_testnzc_pd(__m256d __a, __m256d __b)
2808{
2809 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2810}
2811
2812/// Given two 256-bit floating-point vectors of [8 x float], perform an
2813/// element-by-element comparison of the single-precision element in the
2814/// first source vector and the corresponding element in the second source
2815/// vector.
2816///
2817/// The EFLAGS register is updated as follows: \n
2818/// If there is at least one pair of single-precision elements where the
2819/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2820/// ZF flag is set to 1. \n
2821/// If there is at least one pair of single-precision elements where the
2822/// sign-bit of the first element is 0 and the sign-bit of the second element
2823/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2824/// This intrinsic returns the value of the ZF flag.
2825///
2826/// \headerfile <x86intrin.h>
2827///
2828/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2829///
2830/// \param __a
2831/// A 256-bit vector of [8 x float].
2832/// \param __b
2833/// A 256-bit vector of [8 x float].
2834/// \returns the ZF flag.
2835static __inline int __DEFAULT_FN_ATTRS
2836_mm256_testz_ps(__m256 __a, __m256 __b)
2837{
2838 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2839}
2840
2841/// Given two 256-bit floating-point vectors of [8 x float], perform an
2842/// element-by-element comparison of the single-precision element in the
2843/// first source vector and the corresponding element in the second source
2844/// vector.
2845///
2846/// The EFLAGS register is updated as follows: \n
2847/// If there is at least one pair of single-precision elements where the
2848/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2849/// ZF flag is set to 1. \n
2850/// If there is at least one pair of single-precision elements where the
2851/// sign-bit of the first element is 0 and the sign-bit of the second element
2852/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2853/// This intrinsic returns the value of the CF flag.
2854///
2855/// \headerfile <x86intrin.h>
2856///
2857/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2858///
2859/// \param __a
2860/// A 256-bit vector of [8 x float].
2861/// \param __b
2862/// A 256-bit vector of [8 x float].
2863/// \returns the CF flag.
2864static __inline int __DEFAULT_FN_ATTRS
2865_mm256_testc_ps(__m256 __a, __m256 __b)
2866{
2867 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2868}
2869
2870/// Given two 256-bit floating-point vectors of [8 x float], perform an
2871/// element-by-element comparison of the single-precision elements in the
2872/// first source vector and the corresponding elements in the second source
2873/// vector.
2874///
2875/// The EFLAGS register is updated as follows: \n
2876/// If there is at least one pair of single-precision elements where the
2877/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2878/// ZF flag is set to 1. \n
2879/// If there is at least one pair of single-precision elements where the
2880/// sign-bit of the first element is 0 and the sign-bit of the second element
2881/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2882/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2883/// otherwise it returns 0.
2884///
2885/// \headerfile <x86intrin.h>
2886///
2887/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2888///
2889/// \param __a
2890/// A 256-bit vector of [8 x float].
2891/// \param __b
2892/// A 256-bit vector of [8 x float].
2893/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2894static __inline int __DEFAULT_FN_ATTRS
2896{
2897 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2898}
2899
2900/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2901/// of the two source vectors.
2902///
2903/// The EFLAGS register is updated as follows: \n
2904/// If there is at least one pair of bits where both bits are 1, the ZF flag
2905/// is set to 0. Otherwise the ZF flag is set to 1. \n
2906/// If there is at least one pair of bits where the bit from the first source
2907/// vector is 0 and the bit from the second source vector is 1, the CF flag
2908/// is set to 0. Otherwise the CF flag is set to 1. \n
2909/// This intrinsic returns the value of the ZF flag.
2910///
2911/// \headerfile <x86intrin.h>
2912///
2913/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2914///
2915/// \param __a
2916/// A 256-bit integer vector.
2917/// \param __b
2918/// A 256-bit integer vector.
2919/// \returns the ZF flag.
2920static __inline int __DEFAULT_FN_ATTRS
2921_mm256_testz_si256(__m256i __a, __m256i __b)
2922{
2923 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2924}
2925
2926/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2927/// of the two source vectors.
2928///
2929/// The EFLAGS register is updated as follows: \n
2930/// If there is at least one pair of bits where both bits are 1, the ZF flag
2931/// is set to 0. Otherwise the ZF flag is set to 1. \n
2932/// If there is at least one pair of bits where the bit from the first source
2933/// vector is 0 and the bit from the second source vector is 1, the CF flag
2934/// is set to 0. Otherwise the CF flag is set to 1. \n
2935/// This intrinsic returns the value of the CF flag.
2936///
2937/// \headerfile <x86intrin.h>
2938///
2939/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2940///
2941/// \param __a
2942/// A 256-bit integer vector.
2943/// \param __b
2944/// A 256-bit integer vector.
2945/// \returns the CF flag.
2946static __inline int __DEFAULT_FN_ATTRS
2947_mm256_testc_si256(__m256i __a, __m256i __b)
2948{
2949 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2950}
2951
2952/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2953/// of the two source vectors.
2954///
2955/// The EFLAGS register is updated as follows: \n
2956/// If there is at least one pair of bits where both bits are 1, the ZF flag
2957/// is set to 0. Otherwise the ZF flag is set to 1. \n
2958/// If there is at least one pair of bits where the bit from the first source
2959/// vector is 0 and the bit from the second source vector is 1, the CF flag
2960/// is set to 0. Otherwise the CF flag is set to 1. \n
2961/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2962/// otherwise it returns 0.
2963///
2964/// \headerfile <x86intrin.h>
2965///
2966/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2967///
2968/// \param __a
2969/// A 256-bit integer vector.
2970/// \param __b
2971/// A 256-bit integer vector.
2972/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2973static __inline int __DEFAULT_FN_ATTRS
2975{
2976 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2977}
2978
2979/* Vector extract sign mask */
2980/// Extracts the sign bits of double-precision floating point elements
2981/// in a 256-bit vector of [4 x double] and writes them to the lower order
2982/// bits of the return value.
2983///
2984/// \headerfile <x86intrin.h>
2985///
2986/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2987///
2988/// \param __a
2989/// A 256-bit vector of [4 x double] containing the double-precision
2990/// floating point values with sign bits to be extracted.
2991/// \returns The sign bits from the operand, written to bits [3:0].
2992static __inline int __DEFAULT_FN_ATTRS
2994{
2995 return __builtin_ia32_movmskpd256((__v4df)__a);
2996}
2997
2998/// Extracts the sign bits of single-precision floating point elements
2999/// in a 256-bit vector of [8 x float] and writes them to the lower order
3000/// bits of the return value.
3001///
3002/// \headerfile <x86intrin.h>
3003///
3004/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
3005///
3006/// \param __a
3007/// A 256-bit vector of [8 x float] containing the single-precision floating
3008/// point values with sign bits to be extracted.
3009/// \returns The sign bits from the operand, written to bits [7:0].
3010static __inline int __DEFAULT_FN_ATTRS
3012{
3013 return __builtin_ia32_movmskps256((__v8sf)__a);
3014}
3015
3016/* Vector __zero */
3017/// Zeroes the contents of all XMM or YMM registers.
3018///
3019/// \headerfile <x86intrin.h>
3020///
3021/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  /* Attributes are spelled out here rather than via __DEFAULT_FN_ATTRS. */
  __builtin_ia32_vzeroall();
}
3027
3028/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
3029///
3030/// \headerfile <x86intrin.h>
3031///
3032/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  /* Attributes are spelled out here rather than via __DEFAULT_FN_ATTRS. */
  __builtin_ia32_vzeroupper();
}
3038
3039/* Vector load with broadcast */
3040/// Loads a scalar single-precision floating point value from the
3041/// specified address pointed to by \a __a and broadcasts it to the elements
3042/// of a [4 x float] vector.
3043///
3044/// \headerfile <x86intrin.h>
3045///
3046/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3047///
3048/// \param __a
3049/// The single-precision floating point value to be broadcast.
3050/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3051/// equal to the broadcast value.
3052static __inline __m128 __DEFAULT_FN_ATTRS128
3054{
3055 struct __mm_broadcast_ss_struct {
3056 float __f;
3057 } __attribute__((__packed__, __may_alias__));
3058 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3059 return __extension__ (__m128){ __f, __f, __f, __f };
3060}
3061
3062/// Loads a scalar double-precision floating point value from the
3063/// specified address pointed to by \a __a and broadcasts it to the elements
3064/// of a [4 x double] vector.
3065///
3066/// \headerfile <x86intrin.h>
3067///
3068/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3069///
3070/// \param __a
3071/// The double-precision floating point value to be broadcast.
3072/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3073/// equal to the broadcast value.
3074static __inline __m256d __DEFAULT_FN_ATTRS
3076{
3077 struct __mm256_broadcast_sd_struct {
3078 double __d;
3079 } __attribute__((__packed__, __may_alias__));
3080 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3081 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3082}
3083
3084/// Loads a scalar single-precision floating point value from the
3085/// specified address pointed to by \a __a and broadcasts it to the elements
3086/// of a [8 x float] vector.
3087///
3088/// \headerfile <x86intrin.h>
3089///
3090/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3091///
3092/// \param __a
3093/// The single-precision floating point value to be broadcast.
3094/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3095/// equal to the broadcast value.
3096static __inline __m256 __DEFAULT_FN_ATTRS
3098{
3099 struct __mm256_broadcast_ss_struct {
3100 float __f;
3101 } __attribute__((__packed__, __may_alias__));
3102 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3103 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3104}
3105
3106/// Loads the data from a 128-bit vector of [2 x double] from the
3107/// specified address pointed to by \a __a and broadcasts it to 128-bit
3108/// elements in a 256-bit vector of [4 x double].
3109///
3110/// \headerfile <x86intrin.h>
3111///
3112/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3113///
3114/// \param __a
3115/// The 128-bit vector of [2 x double] to be broadcast.
3116/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3117/// equal to the broadcast value.
3118static __inline __m256d __DEFAULT_FN_ATTRS
3120{
3121 __m128d __b = _mm_loadu_pd((const double *)__a);
3122 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3123 0, 1, 0, 1);
3124}
3125
3126/// Loads the data from a 128-bit vector of [4 x float] from the
3127/// specified address pointed to by \a __a and broadcasts it to 128-bit
3128/// elements in a 256-bit vector of [8 x float].
3129///
3130/// \headerfile <x86intrin.h>
3131///
3132/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3133///
3134/// \param __a
3135/// The 128-bit vector of [4 x float] to be broadcast.
3136/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3137/// equal to the broadcast value.
3138static __inline __m256 __DEFAULT_FN_ATTRS
3140{
3141 __m128 __b = _mm_loadu_ps((const float *)__a);
3142 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3143 0, 1, 2, 3, 0, 1, 2, 3);
3144}
3145
3146/* SIMD load ops */
3147/// Loads 4 double-precision floating point values from a 32-byte aligned
3148/// memory location pointed to by \a __p into a vector of [4 x double].
3149///
3150/// \headerfile <x86intrin.h>
3151///
3152/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3153///
3154/// \param __p
3155/// A 32-byte aligned pointer to a memory location containing
3156/// double-precision floating point values.
3157/// \returns A 256-bit vector of [4 x double] containing the moved values.
3158static __inline __m256d __DEFAULT_FN_ATTRS
3159_mm256_load_pd(double const *__p)
3160{
3161 return *(const __m256d *)__p;
3162}
3163
3164/// Loads 8 single-precision floating point values from a 32-byte aligned
3165/// memory location pointed to by \a __p into a vector of [8 x float].
3166///
3167/// \headerfile <x86intrin.h>
3168///
3169/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3170///
3171/// \param __p
3172/// A 32-byte aligned pointer to a memory location containing float values.
3173/// \returns A 256-bit vector of [8 x float] containing the moved values.
3174static __inline __m256 __DEFAULT_FN_ATTRS
3175_mm256_load_ps(float const *__p)
3176{
3177 return *(const __m256 *)__p;
3178}
3179
3180/// Loads 4 double-precision floating point values from an unaligned
3181/// memory location pointed to by \a __p into a vector of [4 x double].
3182///
3183/// \headerfile <x86intrin.h>
3184///
3185/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3186///
3187/// \param __p
3188/// A pointer to a memory location containing double-precision floating
3189/// point values.
3190/// \returns A 256-bit vector of [4 x double] containing the moved values.
3191static __inline __m256d __DEFAULT_FN_ATTRS
3192_mm256_loadu_pd(double const *__p)
3193{
3194 struct __loadu_pd {
3195 __m256d_u __v;
3196 } __attribute__((__packed__, __may_alias__));
3197 return ((const struct __loadu_pd*)__p)->__v;
3198}
3199
3200/// Loads 8 single-precision floating point values from an unaligned
3201/// memory location pointed to by \a __p into a vector of [8 x float].
3202///
3203/// \headerfile <x86intrin.h>
3204///
3205/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3206///
3207/// \param __p
3208/// A pointer to a memory location containing single-precision floating
3209/// point values.
3210/// \returns A 256-bit vector of [8 x float] containing the moved values.
3211static __inline __m256 __DEFAULT_FN_ATTRS
3213{
3214 struct __loadu_ps {
3215 __m256_u __v;
3216 } __attribute__((__packed__, __may_alias__));
3217 return ((const struct __loadu_ps*)__p)->__v;
3218}
3219
3220/// Loads 256 bits of integer data from a 32-byte aligned memory
3221/// location pointed to by \a __p into elements of a 256-bit integer vector.
3222///
3223/// \headerfile <x86intrin.h>
3224///
3225/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3226///
3227/// \param __p
3228/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3229/// values.
3230/// \returns A 256-bit integer vector containing the moved values.
3231static __inline __m256i __DEFAULT_FN_ATTRS
3232_mm256_load_si256(__m256i const *__p)
3233{
3234 return *__p;
3235}
3236
3237/// Loads 256 bits of integer data from an unaligned memory location
3238/// pointed to by \a __p into a 256-bit integer vector.
3239///
3240/// \headerfile <x86intrin.h>
3241///
3242/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3243///
3244/// \param __p
3245/// A pointer to a 256-bit integer vector containing integer values.
3246/// \returns A 256-bit integer vector containing the moved values.
3247static __inline __m256i __DEFAULT_FN_ATTRS
3248_mm256_loadu_si256(__m256i_u const *__p)
3249{
3250 struct __loadu_si256 {
3251 __m256i_u __v;
3252 } __attribute__((__packed__, __may_alias__));
3253 return ((const struct __loadu_si256*)__p)->__v;
3254}
3255
3256/// Loads 256 bits of integer data from an unaligned memory location
3257/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3258/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3259/// line boundary.
3260///
3261/// \headerfile <x86intrin.h>
3262///
3263/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3264///
3265/// \param __p
3266/// A pointer to a 256-bit integer vector containing integer values.
3267/// \returns A 256-bit integer vector containing the moved values.
3268static __inline __m256i __DEFAULT_FN_ATTRS
3269_mm256_lddqu_si256(__m256i_u const *__p)
3270{
3271 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3272}
3273
3274/* SIMD store ops */
3275/// Stores double-precision floating point values from a 256-bit vector
3276/// of [4 x double] to a 32-byte aligned memory location pointed to by
3277/// \a __p.
3278///
3279/// \headerfile <x86intrin.h>
3280///
3281/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3282///
3283/// \param __p
3284/// A 32-byte aligned pointer to a memory location that will receive the
///    double-precision floating point values.
3286/// \param __a
3287/// A 256-bit vector of [4 x double] containing the values to be moved.
3288static __inline void __DEFAULT_FN_ATTRS
3289_mm256_store_pd(double *__p, __m256d __a)
3290{
3291 *(__m256d *)__p = __a;
3292}
3293
3294/// Stores single-precision floating point values from a 256-bit vector
3295/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3296///
3297/// \headerfile <x86intrin.h>
3298///
3299/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3300///
3301/// \param __p
3302/// A 32-byte aligned pointer to a memory location that will receive the
3303/// float values.
3304/// \param __a
3305/// A 256-bit vector of [8 x float] containing the values to be moved.
3306static __inline void __DEFAULT_FN_ATTRS
3307_mm256_store_ps(float *__p, __m256 __a)
3308{
3309 *(__m256 *)__p = __a;
3310}
3311
3312/// Stores double-precision floating point values from a 256-bit vector
3313/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3314///
3315/// \headerfile <x86intrin.h>
3316///
3317/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3318///
3319/// \param __p
3320/// A pointer to a memory location that will receive the double-precision
3321/// floating point values.
3322/// \param __a
3323/// A 256-bit vector of [4 x double] containing the values to be moved.
3324static __inline void __DEFAULT_FN_ATTRS
3325_mm256_storeu_pd(double *__p, __m256d __a)
3326{
3327 struct __storeu_pd {
3328 __m256d_u __v;
3329 } __attribute__((__packed__, __may_alias__));
3330 ((struct __storeu_pd*)__p)->__v = __a;
3331}
3332
3333/// Stores single-precision floating point values from a 256-bit vector
3334/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3335///
3336/// \headerfile <x86intrin.h>
3337///
3338/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3339///
3340/// \param __p
3341/// A pointer to a memory location that will receive the float values.
3342/// \param __a
3343/// A 256-bit vector of [8 x float] containing the values to be moved.
3344static __inline void __DEFAULT_FN_ATTRS
3345_mm256_storeu_ps(float *__p, __m256 __a)
3346{
3347 struct __storeu_ps {
3348 __m256_u __v;
3349 } __attribute__((__packed__, __may_alias__));
3350 ((struct __storeu_ps*)__p)->__v = __a;
3351}
3352
3353/// Stores integer values from a 256-bit integer vector to a 32-byte
3354/// aligned memory location pointed to by \a __p.
3355///
3356/// \headerfile <x86intrin.h>
3357///
3358/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3359///
3360/// \param __p
3361/// A 32-byte aligned pointer to a memory location that will receive the
3362/// integer values.
3363/// \param __a
3364/// A 256-bit integer vector containing the values to be moved.
3365static __inline void __DEFAULT_FN_ATTRS
3366_mm256_store_si256(__m256i *__p, __m256i __a)
3367{
3368 *__p = __a;
3369}
3370
3371/// Stores integer values from a 256-bit integer vector to an unaligned
3372/// memory location pointed to by \a __p.
3373///
3374/// \headerfile <x86intrin.h>
3375///
3376/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3377///
3378/// \param __p
3379/// A pointer to a memory location that will receive the integer values.
3380/// \param __a
3381/// A 256-bit integer vector containing the values to be moved.
3382static __inline void __DEFAULT_FN_ATTRS
3383_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3384{
3385 struct __storeu_si256 {
3386 __m256i_u __v;
3387 } __attribute__((__packed__, __may_alias__));
3388 ((struct __storeu_si256*)__p)->__v = __a;
3389}
3390
3391/* Conditional load ops */
3392/// Conditionally loads double-precision floating point elements from a
3393/// memory location pointed to by \a __p into a 128-bit vector of
3394/// [2 x double], depending on the mask bits associated with each data
3395/// element.
3396///
3397/// \headerfile <x86intrin.h>
3398///
3399/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3400///
3401/// \param __p
3402/// A pointer to a memory location that contains the double-precision
3403/// floating point values.
3404/// \param __m
3405/// A 128-bit integer vector containing the mask. The most significant bit of
3406/// each data element represents the mask bits. If a mask bit is zero, the
3407/// corresponding value in the memory location is not loaded and the
3408/// corresponding field in the return value is set to zero.
3409/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3410static __inline __m128d __DEFAULT_FN_ATTRS128
3411_mm_maskload_pd(double const *__p, __m128i __m)
3412{
3413 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3414}
3415
3416/// Conditionally loads double-precision floating point elements from a
3417/// memory location pointed to by \a __p into a 256-bit vector of
3418/// [4 x double], depending on the mask bits associated with each data
3419/// element.
3420///
3421/// \headerfile <x86intrin.h>
3422///
3423/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3424///
3425/// \param __p
3426/// A pointer to a memory location that contains the double-precision
3427/// floating point values.
3428/// \param __m
3429/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3430/// significant bit of each quadword element represents the mask bits. If a
3431/// mask bit is zero, the corresponding value in the memory location is not
3432/// loaded and the corresponding field in the return value is set to zero.
3433/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3434static __inline __m256d __DEFAULT_FN_ATTRS
3435_mm256_maskload_pd(double const *__p, __m256i __m)
3436{
3437 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3438 (__v4di)__m);
3439}
3440
3441/// Conditionally loads single-precision floating point elements from a
3442/// memory location pointed to by \a __p into a 128-bit vector of
3443/// [4 x float], depending on the mask bits associated with each data
3444/// element.
3445///
3446/// \headerfile <x86intrin.h>
3447///
3448/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3449///
3450/// \param __p
3451/// A pointer to a memory location that contains the single-precision
3452/// floating point values.
3453/// \param __m
3454/// A 128-bit integer vector containing the mask. The most significant bit of
3455/// each data element represents the mask bits. If a mask bit is zero, the
3456/// corresponding value in the memory location is not loaded and the
3457/// corresponding field in the return value is set to zero.
3458/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3459static __inline __m128 __DEFAULT_FN_ATTRS128
3460_mm_maskload_ps(float const *__p, __m128i __m)
3461{
3462 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3463}
3464
3465/// Conditionally loads single-precision floating point elements from a
3466/// memory location pointed to by \a __p into a 256-bit vector of
3467/// [8 x float], depending on the mask bits associated with each data
3468/// element.
3469///
3470/// \headerfile <x86intrin.h>
3471///
3472/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3473///
3474/// \param __p
3475/// A pointer to a memory location that contains the single-precision
3476/// floating point values.
3477/// \param __m
3478/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3479/// significant bit of each dword element represents the mask bits. If a mask
3480/// bit is zero, the corresponding value in the memory location is not loaded
3481/// and the corresponding field in the return value is set to zero.
3482/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3483static __inline __m256 __DEFAULT_FN_ATTRS
3484_mm256_maskload_ps(float const *__p, __m256i __m)
3485{
3486 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3487}
3488
3489/* Conditional store ops */
3490/// Moves single-precision floating point values from a 256-bit vector
3491/// of [8 x float] to a memory location pointed to by \a __p, according to
3492/// the specified mask.
3493///
3494/// \headerfile <x86intrin.h>
3495///
3496/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3497///
3498/// \param __p
3499/// A pointer to a memory location that will receive the float values.
3500/// \param __m
3501/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3502/// significant bit of each dword element in the mask vector represents the
3503/// mask bits. If a mask bit is zero, the corresponding value from vector
3504/// \a __a is not stored and the corresponding field in the memory location
3505/// pointed to by \a __p is not changed.
3506/// \param __a
3507/// A 256-bit vector of [8 x float] containing the values to be stored.
3508static __inline void __DEFAULT_FN_ATTRS
3509_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3510{
3511 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3512}
3513
3514/// Moves double-precision values from a 128-bit vector of [2 x double]
3515/// to a memory location pointed to by \a __p, according to the specified
3516/// mask.
3517///
3518/// \headerfile <x86intrin.h>
3519///
3520/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3521///
3522/// \param __p
///    A pointer to a memory location that will receive the double-precision
///    values.
3524/// \param __m
3525/// A 128-bit integer vector containing the mask. The most significant bit of
3526/// each field in the mask vector represents the mask bits. If a mask bit is
3527/// zero, the corresponding value from vector \a __a is not stored and the
3528/// corresponding field in the memory location pointed to by \a __p is not
3529/// changed.
3530/// \param __a
3531/// A 128-bit vector of [2 x double] containing the values to be stored.
3532static __inline void __DEFAULT_FN_ATTRS128
3533_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3534{
3535 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3536}
3537
3538/// Moves double-precision values from a 256-bit vector of [4 x double]
3539/// to a memory location pointed to by \a __p, according to the specified
3540/// mask.
3541///
3542/// \headerfile <x86intrin.h>
3543///
3544/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3545///
3546/// \param __p
///    A pointer to a memory location that will receive the double-precision
///    values.
3548/// \param __m
3549/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3550/// significant bit of each quadword element in the mask vector represents
3551/// the mask bits. If a mask bit is zero, the corresponding value from vector
///    \a __a is not stored and the corresponding field in the memory location
3553/// pointed to by \a __p is not changed.
3554/// \param __a
3555/// A 256-bit vector of [4 x double] containing the values to be stored.
3556static __inline void __DEFAULT_FN_ATTRS
3557_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3558{
3559 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3560}
3561
3562/// Moves single-precision floating point values from a 128-bit vector
3563/// of [4 x float] to a memory location pointed to by \a __p, according to
3564/// the specified mask.
3565///
3566/// \headerfile <x86intrin.h>
3567///
3568/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3569///
3570/// \param __p
3571/// A pointer to a memory location that will receive the float values.
3572/// \param __m
3573/// A 128-bit integer vector containing the mask. The most significant bit of
3574/// each field in the mask vector represents the mask bits. If a mask bit is
///    zero, the corresponding value from vector \a __a is not stored and the
3576/// corresponding field in the memory location pointed to by \a __p is not
3577/// changed.
3578/// \param __a
3579/// A 128-bit vector of [4 x float] containing the values to be stored.
3580static __inline void __DEFAULT_FN_ATTRS128
3581_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3582{
3583 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3584}
3585
3586/* Cacheability support ops */
3587/// Moves integer data from a 256-bit integer vector to a 32-byte
3588/// aligned memory location. To minimize caching, the data is flagged as
3589/// non-temporal (unlikely to be used again soon).
3590///
3591/// \headerfile <x86intrin.h>
3592///
3593/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3594///
3595/// \param __a
3596/// A pointer to a 32-byte aligned memory location that will receive the
3597/// integer values.
3598/// \param __b
3599/// A 256-bit integer vector containing the values to be moved.
3600static __inline void __DEFAULT_FN_ATTRS
3602{
3603 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3604 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3605}
3606
3607/// Moves double-precision values from a 256-bit vector of [4 x double]
3608/// to a 32-byte aligned memory location. To minimize caching, the data is
3609/// flagged as non-temporal (unlikely to be used again soon).
3610///
3611/// \headerfile <x86intrin.h>
3612///
3613/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3614///
3615/// \param __a
3616/// A pointer to a 32-byte aligned memory location that will receive the
3617/// double-precision floating-point values.
3618/// \param __b
3619/// A 256-bit vector of [4 x double] containing the values to be moved.
3620static __inline void __DEFAULT_FN_ATTRS
3621_mm256_stream_pd(void *__a, __m256d __b)
3622{
3623 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3624 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3625}
3626
3627/// Moves single-precision floating point values from a 256-bit vector
3628/// of [8 x float] to a 32-byte aligned memory location. To minimize
3629/// caching, the data is flagged as non-temporal (unlikely to be used again
3630/// soon).
3631///
3632/// \headerfile <x86intrin.h>
3633///
3634/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3635///
3636/// \param __p
3637/// A pointer to a 32-byte aligned memory location that will receive the
3638/// single-precision floating point values.
3639/// \param __a
3640/// A 256-bit vector of [8 x float] containing the values to be moved.
3641static __inline void __DEFAULT_FN_ATTRS
3643{
3644 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3645 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3646}
3647
3648/* Create vectors */
3649/// Create a 256-bit vector of [4 x double] with undefined values.
3650///
3651/// \headerfile <x86intrin.h>
3652///
3653/// This intrinsic has no corresponding instruction.
3654///
3655/// \returns A 256-bit vector of [4 x double] containing undefined values.
3656static __inline__ __m256d __DEFAULT_FN_ATTRS
3658{
3659 return (__m256d)__builtin_ia32_undef256();
3660}
3661
3662/// Create a 256-bit vector of [8 x float] with undefined values.
3663///
3664/// \headerfile <x86intrin.h>
3665///
3666/// This intrinsic has no corresponding instruction.
3667///
3668/// \returns A 256-bit vector of [8 x float] containing undefined values.
3669static __inline__ __m256 __DEFAULT_FN_ATTRS
3671{
3672 return (__m256)__builtin_ia32_undef256();
3673}
3674
3675/// Create a 256-bit integer vector with undefined values.
3676///
3677/// \headerfile <x86intrin.h>
3678///
3679/// This intrinsic has no corresponding instruction.
3680///
3681/// \returns A 256-bit integer vector containing undefined values.
3682static __inline__ __m256i __DEFAULT_FN_ATTRS
3684{
3685 return (__m256i)__builtin_ia32_undef256();
3686}
3687
3688/// Constructs a 256-bit floating-point vector of [4 x double]
3689/// initialized with the specified double-precision floating-point values.
3690///
3691/// \headerfile <x86intrin.h>
3692///
3693/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3694/// instruction.
3695///
3696/// \param __a
3697/// A double-precision floating-point value used to initialize bits [255:192]
3698/// of the result.
3699/// \param __b
3700/// A double-precision floating-point value used to initialize bits [191:128]
3701/// of the result.
3702/// \param __c
3703/// A double-precision floating-point value used to initialize bits [127:64]
3704/// of the result.
3705/// \param __d
3706/// A double-precision floating-point value used to initialize bits [63:0]
3707/// of the result.
3708/// \returns An initialized 256-bit floating-point vector of [4 x double].
3709static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3710_mm256_set_pd(double __a, double __b, double __c, double __d)
3711{
3712 return __extension__ (__m256d){ __d, __c, __b, __a };
3713}
3714
3715/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3716/// with the specified single-precision floating-point values.
3717///
3718/// \headerfile <x86intrin.h>
3719///
3720/// This intrinsic is a utility function and does not correspond to a specific
3721/// instruction.
3722///
3723/// \param __a
3724/// A single-precision floating-point value used to initialize bits [255:224]
3725/// of the result.
3726/// \param __b
3727/// A single-precision floating-point value used to initialize bits [223:192]
3728/// of the result.
3729/// \param __c
3730/// A single-precision floating-point value used to initialize bits [191:160]
3731/// of the result.
3732/// \param __d
3733/// A single-precision floating-point value used to initialize bits [159:128]
3734/// of the result.
3735/// \param __e
3736/// A single-precision floating-point value used to initialize bits [127:96]
3737/// of the result.
3738/// \param __f
3739/// A single-precision floating-point value used to initialize bits [95:64]
3740/// of the result.
3741/// \param __g
3742/// A single-precision floating-point value used to initialize bits [63:32]
3743/// of the result.
3744/// \param __h
3745/// A single-precision floating-point value used to initialize bits [31:0]
3746/// of the result.
3747/// \returns An initialized 256-bit floating-point vector of [8 x float].
3748static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3749_mm256_set_ps(float __a, float __b, float __c, float __d,
3750 float __e, float __f, float __g, float __h)
3751{
3752 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3753}
3754
3755/// Constructs a 256-bit integer vector initialized with the specified
3756/// 32-bit integral values.
3757///
3758/// \headerfile <x86intrin.h>
3759///
3760/// This intrinsic is a utility function and does not correspond to a specific
3761/// instruction.
3762///
3763/// \param __i0
3764/// A 32-bit integral value used to initialize bits [255:224] of the result.
3765/// \param __i1
3766/// A 32-bit integral value used to initialize bits [223:192] of the result.
3767/// \param __i2
3768/// A 32-bit integral value used to initialize bits [191:160] of the result.
3769/// \param __i3
3770/// A 32-bit integral value used to initialize bits [159:128] of the result.
3771/// \param __i4
3772/// A 32-bit integral value used to initialize bits [127:96] of the result.
3773/// \param __i5
3774/// A 32-bit integral value used to initialize bits [95:64] of the result.
3775/// \param __i6
3776/// A 32-bit integral value used to initialize bits [63:32] of the result.
3777/// \param __i7
3778/// A 32-bit integral value used to initialize bits [31:0] of the result.
3779/// \returns An initialized 256-bit integer vector.
3780static __inline __m256i __DEFAULT_FN_ATTRS
3781_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3782 int __i4, int __i5, int __i6, int __i7)
3783{
3784 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3785}
3786
3787/// Constructs a 256-bit integer vector initialized with the specified
3788/// 16-bit integral values.
3789///
3790/// \headerfile <x86intrin.h>
3791///
3792/// This intrinsic is a utility function and does not correspond to a specific
3793/// instruction.
3794///
3795/// \param __w15
3796/// A 16-bit integral value used to initialize bits [255:240] of the result.
3797/// \param __w14
3798/// A 16-bit integral value used to initialize bits [239:224] of the result.
3799/// \param __w13
3800/// A 16-bit integral value used to initialize bits [223:208] of the result.
3801/// \param __w12
3802/// A 16-bit integral value used to initialize bits [207:192] of the result.
3803/// \param __w11
3804/// A 16-bit integral value used to initialize bits [191:176] of the result.
3805/// \param __w10
3806/// A 16-bit integral value used to initialize bits [175:160] of the result.
3807/// \param __w09
3808/// A 16-bit integral value used to initialize bits [159:144] of the result.
3809/// \param __w08
3810/// A 16-bit integral value used to initialize bits [143:128] of the result.
3811/// \param __w07
3812/// A 16-bit integral value used to initialize bits [127:112] of the result.
3813/// \param __w06
3814/// A 16-bit integral value used to initialize bits [111:96] of the result.
3815/// \param __w05
3816/// A 16-bit integral value used to initialize bits [95:80] of the result.
3817/// \param __w04
3818/// A 16-bit integral value used to initialize bits [79:64] of the result.
3819/// \param __w03
3820/// A 16-bit integral value used to initialize bits [63:48] of the result.
3821/// \param __w02
3822/// A 16-bit integral value used to initialize bits [47:32] of the result.
3823/// \param __w01
3824/// A 16-bit integral value used to initialize bits [31:16] of the result.
3825/// \param __w00
3826/// A 16-bit integral value used to initialize bits [15:0] of the result.
3827/// \returns An initialized 256-bit integer vector.
3828static __inline __m256i __DEFAULT_FN_ATTRS
3829_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3830 short __w11, short __w10, short __w09, short __w08,
3831 short __w07, short __w06, short __w05, short __w04,
3832 short __w03, short __w02, short __w01, short __w00)
3833{
3834 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3835 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3836}
3837
3838/// Constructs a 256-bit integer vector initialized with the specified
3839/// 8-bit integral values.
3840///
3841/// \headerfile <x86intrin.h>
3842///
3843/// This intrinsic is a utility function and does not correspond to a specific
3844/// instruction.
3845///
3846/// \param __b31
3847/// An 8-bit integral value used to initialize bits [255:248] of the result.
3848/// \param __b30
3849/// An 8-bit integral value used to initialize bits [247:240] of the result.
3850/// \param __b29
3851/// An 8-bit integral value used to initialize bits [239:232] of the result.
3852/// \param __b28
3853/// An 8-bit integral value used to initialize bits [231:224] of the result.
3854/// \param __b27
3855/// An 8-bit integral value used to initialize bits [223:216] of the result.
3856/// \param __b26
3857/// An 8-bit integral value used to initialize bits [215:208] of the result.
3858/// \param __b25
3859/// An 8-bit integral value used to initialize bits [207:200] of the result.
3860/// \param __b24
3861/// An 8-bit integral value used to initialize bits [199:192] of the result.
3862/// \param __b23
3863/// An 8-bit integral value used to initialize bits [191:184] of the result.
3864/// \param __b22
3865/// An 8-bit integral value used to initialize bits [183:176] of the result.
3866/// \param __b21
3867/// An 8-bit integral value used to initialize bits [175:168] of the result.
3868/// \param __b20
3869/// An 8-bit integral value used to initialize bits [167:160] of the result.
3870/// \param __b19
3871/// An 8-bit integral value used to initialize bits [159:152] of the result.
3872/// \param __b18
3873/// An 8-bit integral value used to initialize bits [151:144] of the result.
3874/// \param __b17
3875/// An 8-bit integral value used to initialize bits [143:136] of the result.
3876/// \param __b16
3877/// An 8-bit integral value used to initialize bits [135:128] of the result.
3878/// \param __b15
3879/// An 8-bit integral value used to initialize bits [127:120] of the result.
3880/// \param __b14
3881/// An 8-bit integral value used to initialize bits [119:112] of the result.
3882/// \param __b13
3883/// An 8-bit integral value used to initialize bits [111:104] of the result.
3884/// \param __b12
3885/// An 8-bit integral value used to initialize bits [103:96] of the result.
3886/// \param __b11
3887/// An 8-bit integral value used to initialize bits [95:88] of the result.
3888/// \param __b10
3889/// An 8-bit integral value used to initialize bits [87:80] of the result.
3890/// \param __b09
3891/// An 8-bit integral value used to initialize bits [79:72] of the result.
3892/// \param __b08
3893/// An 8-bit integral value used to initialize bits [71:64] of the result.
3894/// \param __b07
3895/// An 8-bit integral value used to initialize bits [63:56] of the result.
3896/// \param __b06
3897/// An 8-bit integral value used to initialize bits [55:48] of the result.
3898/// \param __b05
3899/// An 8-bit integral value used to initialize bits [47:40] of the result.
3900/// \param __b04
3901/// An 8-bit integral value used to initialize bits [39:32] of the result.
3902/// \param __b03
3903/// An 8-bit integral value used to initialize bits [31:24] of the result.
3904/// \param __b02
3905/// An 8-bit integral value used to initialize bits [23:16] of the result.
3906/// \param __b01
3907/// An 8-bit integral value used to initialize bits [15:8] of the result.
3908/// \param __b00
3909/// An 8-bit integral value used to initialize bits [7:0] of the result.
3910/// \returns An initialized 256-bit integer vector.
3911static __inline __m256i __DEFAULT_FN_ATTRS
3912_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3913 char __b27, char __b26, char __b25, char __b24,
3914 char __b23, char __b22, char __b21, char __b20,
3915 char __b19, char __b18, char __b17, char __b16,
3916 char __b15, char __b14, char __b13, char __b12,
3917 char __b11, char __b10, char __b09, char __b08,
3918 char __b07, char __b06, char __b05, char __b04,
3919 char __b03, char __b02, char __b01, char __b00)
3920{
3921 return __extension__ (__m256i)(__v32qi){
3922 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3923 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3924 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3925 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3926 };
3927}
3928
3929/// Constructs a 256-bit integer vector initialized with the specified
3930/// 64-bit integral values.
3931///
3932/// \headerfile <x86intrin.h>
3933///
3934/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3935/// instruction.
3936///
3937/// \param __a
3938/// A 64-bit integral value used to initialize bits [255:192] of the result.
3939/// \param __b
3940/// A 64-bit integral value used to initialize bits [191:128] of the result.
3941/// \param __c
3942/// A 64-bit integral value used to initialize bits [127:64] of the result.
3943/// \param __d
3944/// A 64-bit integral value used to initialize bits [63:0] of the result.
3945/// \returns An initialized 256-bit integer vector.
3946static __inline __m256i __DEFAULT_FN_ATTRS
3947_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3948{
3949 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3950}
3951
3952/* Create vectors with elements in reverse order */
3953/// Constructs a 256-bit floating-point vector of [4 x double],
3954/// initialized in reverse order with the specified double-precision
3955/// floating-point values.
3956///
3957/// \headerfile <x86intrin.h>
3958///
3959/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3960/// instruction.
3961///
3962/// \param __a
3963/// A double-precision floating-point value used to initialize bits [63:0]
3964/// of the result.
3965/// \param __b
3966/// A double-precision floating-point value used to initialize bits [127:64]
3967/// of the result.
3968/// \param __c
3969/// A double-precision floating-point value used to initialize bits [191:128]
3970/// of the result.
3971/// \param __d
3972/// A double-precision floating-point value used to initialize bits [255:192]
3973/// of the result.
3974/// \returns An initialized 256-bit floating-point vector of [4 x double].
3975static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3976_mm256_setr_pd(double __a, double __b, double __c, double __d)
3977{
3978 return _mm256_set_pd(__d, __c, __b, __a);
3979}
3980
3981/// Constructs a 256-bit floating-point vector of [8 x float],
3982/// initialized in reverse order with the specified single-precision
3983/// floating-point values.
3984///
3985/// \headerfile <x86intrin.h>
3986///
3987/// This intrinsic is a utility function and does not correspond to a specific
3988/// instruction.
3989///
3990/// \param __a
3991/// A single-precision floating-point value used to initialize bits [31:0]
3992/// of the result.
3993/// \param __b
3994/// A single-precision floating-point value used to initialize bits [63:32]
3995/// of the result.
3996/// \param __c
3997/// A single-precision floating-point value used to initialize bits [95:64]
3998/// of the result.
3999/// \param __d
4000/// A single-precision floating-point value used to initialize bits [127:96]
4001/// of the result.
4002/// \param __e
4003/// A single-precision floating-point value used to initialize bits [159:128]
4004/// of the result.
4005/// \param __f
4006/// A single-precision floating-point value used to initialize bits [191:160]
4007/// of the result.
4008/// \param __g
4009/// A single-precision floating-point value used to initialize bits [223:192]
4010/// of the result.
4011/// \param __h
4012/// A single-precision floating-point value used to initialize bits [255:224]
4013/// of the result.
4014/// \returns An initialized 256-bit floating-point vector of [8 x float].
4015static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4016_mm256_setr_ps(float __a, float __b, float __c, float __d,
4017 float __e, float __f, float __g, float __h)
4018{
4019 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
4020}
4021
4022/// Constructs a 256-bit integer vector, initialized in reverse order
4023/// with the specified 32-bit integral values.
4024///
4025/// \headerfile <x86intrin.h>
4026///
4027/// This intrinsic is a utility function and does not correspond to a specific
4028/// instruction.
4029///
4030/// \param __i0
4031/// A 32-bit integral value used to initialize bits [31:0] of the result.
4032/// \param __i1
4033/// A 32-bit integral value used to initialize bits [63:32] of the result.
4034/// \param __i2
4035/// A 32-bit integral value used to initialize bits [95:64] of the result.
4036/// \param __i3
4037/// A 32-bit integral value used to initialize bits [127:96] of the result.
4038/// \param __i4
4039/// A 32-bit integral value used to initialize bits [159:128] of the result.
4040/// \param __i5
4041/// A 32-bit integral value used to initialize bits [191:160] of the result.
4042/// \param __i6
4043/// A 32-bit integral value used to initialize bits [223:192] of the result.
4044/// \param __i7
4045/// A 32-bit integral value used to initialize bits [255:224] of the result.
4046/// \returns An initialized 256-bit integer vector.
4047static __inline __m256i __DEFAULT_FN_ATTRS
4048_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4049 int __i4, int __i5, int __i6, int __i7)
4050{
4051 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4052}
4053
4054/// Constructs a 256-bit integer vector, initialized in reverse order
4055/// with the specified 16-bit integral values.
4056///
4057/// \headerfile <x86intrin.h>
4058///
4059/// This intrinsic is a utility function and does not correspond to a specific
4060/// instruction.
4061///
4062/// \param __w15
4063/// A 16-bit integral value used to initialize bits [15:0] of the result.
4064/// \param __w14
4065/// A 16-bit integral value used to initialize bits [31:16] of the result.
4066/// \param __w13
4067/// A 16-bit integral value used to initialize bits [47:32] of the result.
4068/// \param __w12
4069/// A 16-bit integral value used to initialize bits [63:48] of the result.
4070/// \param __w11
4071/// A 16-bit integral value used to initialize bits [79:64] of the result.
4072/// \param __w10
4073/// A 16-bit integral value used to initialize bits [95:80] of the result.
4074/// \param __w09
4075/// A 16-bit integral value used to initialize bits [111:96] of the result.
4076/// \param __w08
4077/// A 16-bit integral value used to initialize bits [127:112] of the result.
4078/// \param __w07
4079/// A 16-bit integral value used to initialize bits [143:128] of the result.
4080/// \param __w06
4081/// A 16-bit integral value used to initialize bits [159:144] of the result.
4082/// \param __w05
4083/// A 16-bit integral value used to initialize bits [175:160] of the result.
4084/// \param __w04
4085/// A 16-bit integral value used to initialize bits [191:176] of the result.
4086/// \param __w03
4087/// A 16-bit integral value used to initialize bits [207:192] of the result.
4088/// \param __w02
4089/// A 16-bit integral value used to initialize bits [223:208] of the result.
4090/// \param __w01
4091/// A 16-bit integral value used to initialize bits [239:224] of the result.
4092/// \param __w00
4093/// A 16-bit integral value used to initialize bits [255:240] of the result.
4094/// \returns An initialized 256-bit integer vector.
4095static __inline __m256i __DEFAULT_FN_ATTRS
4096_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4097 short __w11, short __w10, short __w09, short __w08,
4098 short __w07, short __w06, short __w05, short __w04,
4099 short __w03, short __w02, short __w01, short __w00)
4100{
4101 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4102 __w04, __w05, __w06, __w07,
4103 __w08, __w09, __w10, __w11,
4104 __w12, __w13, __w14, __w15);
4105}
4106
4107/// Constructs a 256-bit integer vector, initialized in reverse order
4108/// with the specified 8-bit integral values.
4109///
4110/// \headerfile <x86intrin.h>
4111///
4112/// This intrinsic is a utility function and does not correspond to a specific
4113/// instruction.
4114///
4115/// \param __b31
4116/// An 8-bit integral value used to initialize bits [7:0] of the result.
4117/// \param __b30
4118/// An 8-bit integral value used to initialize bits [15:8] of the result.
4119/// \param __b29
4120/// An 8-bit integral value used to initialize bits [23:16] of the result.
4121/// \param __b28
4122/// An 8-bit integral value used to initialize bits [31:24] of the result.
4123/// \param __b27
4124/// An 8-bit integral value used to initialize bits [39:32] of the result.
4125/// \param __b26
4126/// An 8-bit integral value used to initialize bits [47:40] of the result.
4127/// \param __b25
4128/// An 8-bit integral value used to initialize bits [55:48] of the result.
4129/// \param __b24
4130/// An 8-bit integral value used to initialize bits [63:56] of the result.
4131/// \param __b23
4132/// An 8-bit integral value used to initialize bits [71:64] of the result.
4133/// \param __b22
4134/// An 8-bit integral value used to initialize bits [79:72] of the result.
4135/// \param __b21
4136/// An 8-bit integral value used to initialize bits [87:80] of the result.
4137/// \param __b20
4138/// An 8-bit integral value used to initialize bits [95:88] of the result.
4139/// \param __b19
4140/// An 8-bit integral value used to initialize bits [103:96] of the result.
4141/// \param __b18
4142/// An 8-bit integral value used to initialize bits [111:104] of the result.
4143/// \param __b17
4144/// An 8-bit integral value used to initialize bits [119:112] of the result.
4145/// \param __b16
4146/// An 8-bit integral value used to initialize bits [127:120] of the result.
4147/// \param __b15
4148/// An 8-bit integral value used to initialize bits [135:128] of the result.
4149/// \param __b14
4150/// An 8-bit integral value used to initialize bits [143:136] of the result.
4151/// \param __b13
4152/// An 8-bit integral value used to initialize bits [151:144] of the result.
4153/// \param __b12
4154/// An 8-bit integral value used to initialize bits [159:152] of the result.
4155/// \param __b11
4156/// An 8-bit integral value used to initialize bits [167:160] of the result.
4157/// \param __b10
4158/// An 8-bit integral value used to initialize bits [175:168] of the result.
4159/// \param __b09
4160/// An 8-bit integral value used to initialize bits [183:176] of the result.
4161/// \param __b08
4162/// An 8-bit integral value used to initialize bits [191:184] of the result.
4163/// \param __b07
4164/// An 8-bit integral value used to initialize bits [199:192] of the result.
4165/// \param __b06
4166/// An 8-bit integral value used to initialize bits [207:200] of the result.
4167/// \param __b05
4168/// An 8-bit integral value used to initialize bits [215:208] of the result.
4169/// \param __b04
4170/// An 8-bit integral value used to initialize bits [223:216] of the result.
4171/// \param __b03
4172/// An 8-bit integral value used to initialize bits [231:224] of the result.
4173/// \param __b02
4174/// An 8-bit integral value used to initialize bits [239:232] of the result.
4175/// \param __b01
4176/// An 8-bit integral value used to initialize bits [247:240] of the result.
4177/// \param __b00
4178/// An 8-bit integral value used to initialize bits [255:248] of the result.
4179/// \returns An initialized 256-bit integer vector.
4180static __inline __m256i __DEFAULT_FN_ATTRS
4181_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4182 char __b27, char __b26, char __b25, char __b24,
4183 char __b23, char __b22, char __b21, char __b20,
4184 char __b19, char __b18, char __b17, char __b16,
4185 char __b15, char __b14, char __b13, char __b12,
4186 char __b11, char __b10, char __b09, char __b08,
4187 char __b07, char __b06, char __b05, char __b04,
4188 char __b03, char __b02, char __b01, char __b00)
4189{
4190 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4191 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4192 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4193 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4194}
4195
4196/// Constructs a 256-bit integer vector, initialized in reverse order
4197/// with the specified 64-bit integral values.
4198///
4199/// \headerfile <x86intrin.h>
4200///
4201/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4202/// instruction.
4203///
4204/// \param __a
4205/// A 64-bit integral value used to initialize bits [63:0] of the result.
4206/// \param __b
4207/// A 64-bit integral value used to initialize bits [127:64] of the result.
4208/// \param __c
4209/// A 64-bit integral value used to initialize bits [191:128] of the result.
4210/// \param __d
4211/// A 64-bit integral value used to initialize bits [255:192] of the result.
4212/// \returns An initialized 256-bit integer vector.
4213static __inline __m256i __DEFAULT_FN_ATTRS
4214_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4215{
4216 return _mm256_set_epi64x(__d, __c, __b, __a);
4217}
4218
4219/* Create vectors with repeated elements */
4220/// Constructs a 256-bit floating-point vector of [4 x double], with each
4221/// of the four double-precision floating-point vector elements set to the
4222/// specified double-precision floating-point value.
4223///
4224/// \headerfile <x86intrin.h>
4225///
4226/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4227///
4228/// \param __w
4229/// A double-precision floating-point value used to initialize each vector
4230/// element of the result.
4231/// \returns An initialized 256-bit floating-point vector of [4 x double].
4232static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4234{
4235 return _mm256_set_pd(__w, __w, __w, __w);
4236}
4237
4238/// Constructs a 256-bit floating-point vector of [8 x float], with each
4239/// of the eight single-precision floating-point vector elements set to the
4240/// specified single-precision floating-point value.
4241///
4242/// \headerfile <x86intrin.h>
4243///
4244/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4245/// instruction.
4246///
4247/// \param __w
4248/// A single-precision floating-point value used to initialize each vector
4249/// element of the result.
4250/// \returns An initialized 256-bit floating-point vector of [8 x float].
4251static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4253{
4254 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4255}
4256
4257/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4258/// 32-bit integral vector elements set to the specified 32-bit integral
4259/// value.
4260///
4261/// \headerfile <x86intrin.h>
4262///
4263/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4264/// instruction.
4265///
4266/// \param __i
4267/// A 32-bit integral value used to initialize each vector element of the
4268/// result.
4269/// \returns An initialized 256-bit integer vector of [8 x i32].
4270static __inline __m256i __DEFAULT_FN_ATTRS
4272{
4273 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4274}
4275
4276/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4277/// 16-bit integral vector elements set to the specified 16-bit integral
4278/// value.
4279///
4280/// \headerfile <x86intrin.h>
4281///
4282/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4283///
4284/// \param __w
4285/// A 16-bit integral value used to initialize each vector element of the
4286/// result.
4287/// \returns An initialized 256-bit integer vector of [16 x i16].
4288static __inline __m256i __DEFAULT_FN_ATTRS
4290{
4291 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4292 __w, __w, __w, __w, __w, __w, __w, __w);
4293}
4294
4295/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4296/// 8-bit integral vector elements set to the specified 8-bit integral value.
4297///
4298/// \headerfile <x86intrin.h>
4299///
4300/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4301///
4302/// \param __b
4303/// An 8-bit integral value used to initialize each vector element of the
4304/// result.
4305/// \returns An initialized 256-bit integer vector of [32 x i8].
4306static __inline __m256i __DEFAULT_FN_ATTRS
4308{
4309 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4310 __b, __b, __b, __b, __b, __b, __b, __b,
4311 __b, __b, __b, __b, __b, __b, __b, __b,
4312 __b, __b, __b, __b, __b, __b, __b, __b);
4313}
4314
4315/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4316/// 64-bit integral vector elements set to the specified 64-bit integral
4317/// value.
4318///
4319/// \headerfile <x86intrin.h>
4320///
4321/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4322///
4323/// \param __q
4324/// A 64-bit integral value used to initialize each vector element of the
4325/// result.
4326/// \returns An initialized 256-bit integer vector of [4 x i64].
4327static __inline __m256i __DEFAULT_FN_ATTRS
4329{
4330 return _mm256_set_epi64x(__q, __q, __q, __q);
4331}
4332
4333/* Create zeroed vectors */
4334/// Constructs a 256-bit floating-point vector of [4 x double] with all
4335/// vector elements initialized to zero.
4336///
4337/// \headerfile <x86intrin.h>
4338///
4339/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4340///
4341/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4343 return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
4344}
4345
4346/// Constructs a 256-bit floating-point vector of [8 x float] with all
4347/// vector elements initialized to zero.
4348///
4349/// \headerfile <x86intrin.h>
4350///
4351/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4352///
4353/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4355 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4356}
4357
4358/// Constructs a 256-bit integer vector initialized to zero.
4359///
4360/// \headerfile <x86intrin.h>
4361///
4362/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4363///
4364/// \returns A 256-bit integer vector initialized to zero.
4365static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4367 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4368}
4369
4370/* Cast between vector types */
4371/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4372/// floating-point vector of [8 x float].
4373///
4374/// \headerfile <x86intrin.h>
4375///
4376/// This intrinsic has no corresponding instruction.
4377///
4378/// \param __a
4379/// A 256-bit floating-point vector of [4 x double].
4380/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4381/// bitwise pattern as the parameter.
4382static __inline __m256 __DEFAULT_FN_ATTRS
4384{
4385 return (__m256)__a;
4386}
4387
4388/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4389/// integer vector.
4390///
4391/// \headerfile <x86intrin.h>
4392///
4393/// This intrinsic has no corresponding instruction.
4394///
4395/// \param __a
4396/// A 256-bit floating-point vector of [4 x double].
4397/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4398/// parameter.
4399static __inline __m256i __DEFAULT_FN_ATTRS
4401{
4402 return (__m256i)__a;
4403}
4404
4405/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4406/// floating-point vector of [4 x double].
4407///
4408/// \headerfile <x86intrin.h>
4409///
4410/// This intrinsic has no corresponding instruction.
4411///
4412/// \param __a
4413/// A 256-bit floating-point vector of [8 x float].
4414/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4415/// bitwise pattern as the parameter.
4416static __inline __m256d __DEFAULT_FN_ATTRS
4418{
4419 return (__m256d)__a;
4420}
4421
4422/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4423/// integer vector.
4424///
4425/// \headerfile <x86intrin.h>
4426///
4427/// This intrinsic has no corresponding instruction.
4428///
4429/// \param __a
4430/// A 256-bit floating-point vector of [8 x float].
4431/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4432/// parameter.
4433static __inline __m256i __DEFAULT_FN_ATTRS
4435{
4436 return (__m256i)__a;
4437}
4438
4439/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4440/// of [8 x float].
4441///
4442/// \headerfile <x86intrin.h>
4443///
4444/// This intrinsic has no corresponding instruction.
4445///
4446/// \param __a
4447/// A 256-bit integer vector.
4448/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4449/// bitwise pattern as the parameter.
4450static __inline __m256 __DEFAULT_FN_ATTRS
4452{
4453 return (__m256)__a;
4454}
4455
4456/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4457/// of [4 x double].
4458///
4459/// \headerfile <x86intrin.h>
4460///
4461/// This intrinsic has no corresponding instruction.
4462///
4463/// \param __a
4464/// A 256-bit integer vector.
4465/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4466/// bitwise pattern as the parameter.
4467static __inline __m256d __DEFAULT_FN_ATTRS
4469{
4470 return (__m256d)__a;
4471}
4472
4473/// Returns the lower 128 bits of a 256-bit floating-point vector of
4474/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4475///
4476/// \headerfile <x86intrin.h>
4477///
4478/// This intrinsic has no corresponding instruction.
4479///
4480/// \param __a
4481/// A 256-bit floating-point vector of [4 x double].
4482/// \returns A 128-bit floating-point vector of [2 x double] containing the
4483/// lower 128 bits of the parameter.
4484static __inline __m128d __DEFAULT_FN_ATTRS
4486{
4487 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4488}
4489
4490/// Returns the lower 128 bits of a 256-bit floating-point vector of
4491/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4492///
4493/// \headerfile <x86intrin.h>
4494///
4495/// This intrinsic has no corresponding instruction.
4496///
4497/// \param __a
4498/// A 256-bit floating-point vector of [8 x float].
4499/// \returns A 128-bit floating-point vector of [4 x float] containing the
4500/// lower 128 bits of the parameter.
4501static __inline __m128 __DEFAULT_FN_ATTRS
4503{
4504 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4505}
4506
4507/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4508///
4509/// \headerfile <x86intrin.h>
4510///
4511/// This intrinsic has no corresponding instruction.
4512///
4513/// \param __a
4514/// A 256-bit integer vector.
4515/// \returns A 128-bit integer vector containing the lower 128 bits of the
4516/// parameter.
4517static __inline __m128i __DEFAULT_FN_ATTRS
4519{
4520 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4521}
4522
4523/// Constructs a 256-bit floating-point vector of [4 x double] from a
4524/// 128-bit floating-point vector of [2 x double].
4525///
4526/// The lower 128 bits contain the value of the source vector. The contents
4527/// of the upper 128 bits are undefined.
4528///
4529/// \headerfile <x86intrin.h>
4530///
4531/// This intrinsic has no corresponding instruction.
4532///
4533/// \param __a
4534/// A 128-bit vector of [2 x double].
4535/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4536/// contain the value of the parameter. The contents of the upper 128 bits
4537/// are undefined.
4538static __inline __m256d __DEFAULT_FN_ATTRS
4540{
4541 return __builtin_shufflevector(
4542 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4543}
4544
4545/// Constructs a 256-bit floating-point vector of [8 x float] from a
4546/// 128-bit floating-point vector of [4 x float].
4547///
4548/// The lower 128 bits contain the value of the source vector. The contents
4549/// of the upper 128 bits are undefined.
4550///
4551/// \headerfile <x86intrin.h>
4552///
4553/// This intrinsic has no corresponding instruction.
4554///
4555/// \param __a
4556/// A 128-bit vector of [4 x float].
4557/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4558/// contain the value of the parameter. The contents of the upper 128 bits
4559/// are undefined.
4560static __inline __m256 __DEFAULT_FN_ATTRS
4562{
4563 return __builtin_shufflevector((__v4sf)__a,
4564 (__v4sf)__builtin_nondeterministic_value(__a),
4565 0, 1, 2, 3, 4, 5, 6, 7);
4566}
4567
4568/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4569///
4570/// The lower 128 bits contain the value of the source vector. The contents
4571/// of the upper 128 bits are undefined.
4572///
4573/// \headerfile <x86intrin.h>
4574///
4575/// This intrinsic has no corresponding instruction.
4576///
4577/// \param __a
4578/// A 128-bit integer vector.
4579/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4580/// the parameter. The contents of the upper 128 bits are undefined.
4581static __inline __m256i __DEFAULT_FN_ATTRS
4583{
4584 return __builtin_shufflevector(
4585 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4586}
4587
4588/// Constructs a 256-bit floating-point vector of [4 x double] from a
4589/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4590/// contain the value of the source vector. The upper 128 bits are set
4591/// to zero.
4592///
4593/// \headerfile <x86intrin.h>
4594///
4595/// This intrinsic has no corresponding instruction.
4596///
4597/// \param __a
4598/// A 128-bit vector of [2 x double].
4599/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4600/// contain the value of the parameter. The upper 128 bits are set to zero.
4601static __inline __m256d __DEFAULT_FN_ATTRS
4603{
4604 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4605}
4606
4607/// Constructs a 256-bit floating-point vector of [8 x float] from a
4608/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4609/// the value of the source vector. The upper 128 bits are set to zero.
4610///
4611/// \headerfile <x86intrin.h>
4612///
4613/// This intrinsic has no corresponding instruction.
4614///
4615/// \param __a
4616/// A 128-bit vector of [4 x float].
4617/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4618/// contain the value of the parameter. The upper 128 bits are set to zero.
4619static __inline __m256 __DEFAULT_FN_ATTRS
4621{
4622 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4623}
4624
4625/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4626/// The lower 128 bits contain the value of the source vector. The upper
4627/// 128 bits are set to zero.
4628///
4629/// \headerfile <x86intrin.h>
4630///
4631/// This intrinsic has no corresponding instruction.
4632///
4633/// \param __a
4634/// A 128-bit integer vector.
4635/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4636/// the parameter. The upper 128 bits are set to zero.
4637static __inline __m256i __DEFAULT_FN_ATTRS
4639{
4640 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4641}
4642
4643/*
4644 Vector insert.
4645 We use macros rather than inlines because we only want to accept
4646 invocations where the immediate M is a constant expression.
4647*/
4648/// Constructs a new 256-bit vector of [8 x float] by first duplicating
4649/// a 256-bit vector of [8 x float] given in the first parameter, and then
4650/// replacing either the upper or the lower 128 bits with the contents of a
4651/// 128-bit vector of [4 x float] in the second parameter.
4652///
4653/// The immediate integer parameter selects whether the upper or the lower
4654/// 128 bits are replaced.
4655///
4656/// \headerfile <x86intrin.h>
4657///
4658/// \code
4659/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4660/// \endcode
4661///
4662/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4663///
4664/// \param V1
4665/// A 256-bit vector of [8 x float]. This vector is copied to the result
4666/// first, and then either the upper or the lower 128 bits of the result will
4667/// be replaced by the contents of \a V2.
4668/// \param V2
4669/// A 128-bit vector of [4 x float]. The contents of this parameter are
4670/// written to either the upper or the lower 128 bits of the result depending
4671/// on the value of parameter \a M.
4672/// \param M
4673/// An immediate integer. The least significant bit determines how the values
4674/// from the two parameters are interleaved: \n
4675/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4676/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4677/// result. \n
4678/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4679/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4680/// result.
4681/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
/* Each vector argument is cast to the builtin's operand type, and M is
   forwarded as an int immediate. */
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4685
4686/// Constructs a new 256-bit vector of [4 x double] by first duplicating
4687/// a 256-bit vector of [4 x double] given in the first parameter, and then
4688/// replacing either the upper or the lower 128 bits with the contents of a
4689/// 128-bit vector of [2 x double] in the second parameter.
4690///
4691/// The immediate integer parameter selects whether the upper or the lower
4692/// 128 bits are replaced.
4693///
4694/// \headerfile <x86intrin.h>
4695///
4696/// \code
4697/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
4698/// \endcode
4699///
4700/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4701///
4702/// \param V1
4703/// A 256-bit vector of [4 x double]. This vector is copied to the result
4704/// first, and then either the upper or the lower 128 bits of the result will
4705/// be replaced by the contents of \a V2.
4706/// \param V2
4707/// A 128-bit vector of [2 x double]. The contents of this parameter are
4708/// written to either the upper or the lower 128 bits of the result depending
4709/// on the value of parameter \a M.
4710/// \param M
4711/// An immediate integer. The least significant bit determines how the values
4712/// from the two parameters are interleaved: \n
4713/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4714/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4715/// result. \n
4716/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4717/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4718/// result.
4719/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
4720#define _mm256_insertf128_pd(V1, V2, M) \
4721 ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
4722 (__v2df)(__m128d)(V2), (int)(M)))
4723
/// Builds a 256-bit integer vector from a copy of the 256-bit integer
///    vector given in the first parameter, with one of its 128-bit halves
///    overwritten by the 128-bit integer vector given in the second
///    parameter.
///
///    The immediate integer parameter selects whether the lower or the upper
///    128 bits are overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the 128-bit half of the result
///    that is not overwritten by \a V2.
/// \param V2
///    A 128-bit integer vector. Its contents are written to either the lower
///    or the upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Its least significant bit selects which half of
///    the result is replaced: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M)                                     \
  ((__m256i)__builtin_ia32_vinsertf128_si256(                                  \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
4761
4762/*
4763 Vector extract.
4764 We use macros rather than inlines because we only want to accept
4765 invocations where the immediate M is a constant expression.
4766*/
/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a
///    128-bit vector of [4 x float]. The immediate integer parameter selects
///    whether the lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M)                                            \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V),              \
                                             (int)(M)))
4790
/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a
///    128-bit vector of [2 x double]. The immediate integer parameter selects
///    whether the lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M)                                            \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V),            \
                                              (int)(M)))
4814
/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit
///    integer vector. The immediate integer parameter selects whether the
///    lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M)                                         \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V),            \
                                              (int)(M)))
4838
4839/// Constructs a 256-bit floating-point vector of [8 x float] by
4840/// concatenating two 128-bit floating-point vectors of [4 x float].
4841///
4842/// \headerfile <x86intrin.h>
4843///
4844/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4845///
4846/// \param __hi
4847/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4848/// 128 bits of the result.
4849/// \param __lo
4850/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4851/// 128 bits of the result.
4852/// \returns A 256-bit floating-point vector of [8 x float] containing the
4853/// concatenated result.
4854static __inline __m256 __DEFAULT_FN_ATTRS
4855_mm256_set_m128 (__m128 __hi, __m128 __lo)
4856{
4857 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4858}
4859
4860/// Constructs a 256-bit floating-point vector of [4 x double] by
4861/// concatenating two 128-bit floating-point vectors of [2 x double].
4862///
4863/// \headerfile <x86intrin.h>
4864///
4865/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4866///
4867/// \param __hi
4868/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4869/// 128 bits of the result.
4870/// \param __lo
4871/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4872/// 128 bits of the result.
4873/// \returns A 256-bit floating-point vector of [4 x double] containing the
4874/// concatenated result.
4875static __inline __m256d __DEFAULT_FN_ATTRS
4876_mm256_set_m128d (__m128d __hi, __m128d __lo)
4877{
4878 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4879}
4880
4881/// Constructs a 256-bit integer vector by concatenating two 128-bit
4882/// integer vectors.
4883///
4884/// \headerfile <x86intrin.h>
4885///
4886/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4887///
4888/// \param __hi
4889/// A 128-bit integer vector to be copied to the upper 128 bits of the
4890/// result.
4891/// \param __lo
4892/// A 128-bit integer vector to be copied to the lower 128 bits of the
4893/// result.
4894/// \returns A 256-bit integer vector containing the concatenated result.
4895static __inline __m256i __DEFAULT_FN_ATTRS
4896_mm256_set_m128i (__m128i __hi, __m128i __lo)
4897{
4898 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4899}
4900
4901/// Constructs a 256-bit floating-point vector of [8 x float] by
4902/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4903/// similar to _mm256_set_m128, but the order of the input parameters is
4904/// swapped.
4905///
4906/// \headerfile <x86intrin.h>
4907///
4908/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4909///
4910/// \param __lo
4911/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4912/// 128 bits of the result.
4913/// \param __hi
4914/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4915/// 128 bits of the result.
4916/// \returns A 256-bit floating-point vector of [8 x float] containing the
4917/// concatenated result.
4918static __inline __m256 __DEFAULT_FN_ATTRS
4919_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4920{
4921 return _mm256_set_m128(__hi, __lo);
4922}
4923
4924/// Constructs a 256-bit floating-point vector of [4 x double] by
4925/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4926/// similar to _mm256_set_m128d, but the order of the input parameters is
4927/// swapped.
4928///
4929/// \headerfile <x86intrin.h>
4930///
4931/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4932///
4933/// \param __lo
4934/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4935/// 128 bits of the result.
4936/// \param __hi
4937/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4938/// 128 bits of the result.
4939/// \returns A 256-bit floating-point vector of [4 x double] containing the
4940/// concatenated result.
4941static __inline __m256d __DEFAULT_FN_ATTRS
4942_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4943{
4944 return (__m256d)_mm256_set_m128d(__hi, __lo);
4945}
4946
4947/// Constructs a 256-bit integer vector by concatenating two 128-bit
4948/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4949/// the input parameters is swapped.
4950///
4951/// \headerfile <x86intrin.h>
4952///
4953/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4954///
4955/// \param __lo
4956/// A 128-bit integer vector to be copied to the lower 128 bits of the
4957/// result.
4958/// \param __hi
4959/// A 128-bit integer vector to be copied to the upper 128 bits of the
4960/// result.
4961/// \returns A 256-bit integer vector containing the concatenated result.
4962static __inline __m256i __DEFAULT_FN_ATTRS
4963_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4964{
4965 return (__m256i)_mm256_set_m128i(__hi, __lo);
4966}
4967
4968/* SIMD load ops (unaligned) */
4969/// Loads two 128-bit floating-point vectors of [4 x float] from
4970/// unaligned memory locations and constructs a 256-bit floating-point vector
4971/// of [8 x float] by concatenating the two 128-bit vectors.
4972///
4973/// \headerfile <x86intrin.h>
4974///
4975/// This intrinsic corresponds to load instructions followed by the
4976/// <c> VINSERTF128 </c> instruction.
4977///
4978/// \param __addr_hi
4979/// A pointer to a 128-bit memory location containing 4 consecutive
4980/// single-precision floating-point values. These values are to be copied to
4981/// bits[255:128] of the result. The address of the memory location does not
4982/// have to be aligned.
4983/// \param __addr_lo
4984/// A pointer to a 128-bit memory location containing 4 consecutive
4985/// single-precision floating-point values. These values are to be copied to
4986/// bits[127:0] of the result. The address of the memory location does not
4987/// have to be aligned.
4988/// \returns A 256-bit floating-point vector of [8 x float] containing the
4989/// concatenated result.
4990static __inline __m256 __DEFAULT_FN_ATTRS
4991_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4992{
4993 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4994}
4995
4996/// Loads two 128-bit floating-point vectors of [2 x double] from
4997/// unaligned memory locations and constructs a 256-bit floating-point vector
4998/// of [4 x double] by concatenating the two 128-bit vectors.
4999///
5000/// \headerfile <x86intrin.h>
5001///
5002/// This intrinsic corresponds to load instructions followed by the
5003/// <c> VINSERTF128 </c> instruction.
5004///
5005/// \param __addr_hi
5006/// A pointer to a 128-bit memory location containing two consecutive
5007/// double-precision floating-point values. These values are to be copied to
5008/// bits[255:128] of the result. The address of the memory location does not
5009/// have to be aligned.
5010/// \param __addr_lo
5011/// A pointer to a 128-bit memory location containing two consecutive
5012/// double-precision floating-point values. These values are to be copied to
5013/// bits[127:0] of the result. The address of the memory location does not
5014/// have to be aligned.
5015/// \returns A 256-bit floating-point vector of [4 x double] containing the
5016/// concatenated result.
5017static __inline __m256d __DEFAULT_FN_ATTRS
5018_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
5019{
5020 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
5021}
5022
5023/// Loads two 128-bit integer vectors from unaligned memory locations and
5024/// constructs a 256-bit integer vector by concatenating the two 128-bit
5025/// vectors.
5026///
5027/// \headerfile <x86intrin.h>
5028///
5029/// This intrinsic corresponds to load instructions followed by the
5030/// <c> VINSERTF128 </c> instruction.
5031///
5032/// \param __addr_hi
5033/// A pointer to a 128-bit memory location containing a 128-bit integer
5034/// vector. This vector is to be copied to bits[255:128] of the result. The
5035/// address of the memory location does not have to be aligned.
5036/// \param __addr_lo
5037/// A pointer to a 128-bit memory location containing a 128-bit integer
5038/// vector. This vector is to be copied to bits[127:0] of the result. The
5039/// address of the memory location does not have to be aligned.
5040/// \returns A 256-bit integer vector containing the concatenated result.
5041static __inline __m256i __DEFAULT_FN_ATTRS
5042_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
5043{
5044 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5045}
5046
5047/* SIMD store ops (unaligned) */
5048/// Stores the upper and lower 128 bits of a 256-bit floating-point
5049/// vector of [8 x float] into two different unaligned memory locations.
5050///
5051/// \headerfile <x86intrin.h>
5052///
5053/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5054/// store instructions.
5055///
5056/// \param __addr_hi
5057/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5058/// copied to this memory location. The address of this memory location does
5059/// not have to be aligned.
5060/// \param __addr_lo
5061/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5062/// copied to this memory location. The address of this memory location does
5063/// not have to be aligned.
5064/// \param __a
5065/// A 256-bit floating-point vector of [8 x float].
5066static __inline void __DEFAULT_FN_ATTRS
5067_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5068{
5069 __m128 __v128;
5070
5071 __v128 = _mm256_castps256_ps128(__a);
5072 _mm_storeu_ps(__addr_lo, __v128);
5073 __v128 = _mm256_extractf128_ps(__a, 1);
5074 _mm_storeu_ps(__addr_hi, __v128);
5075}
5076
5077/// Stores the upper and lower 128 bits of a 256-bit floating-point
5078/// vector of [4 x double] into two different unaligned memory locations.
5079///
5080/// \headerfile <x86intrin.h>
5081///
5082/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5083/// store instructions.
5084///
5085/// \param __addr_hi
5086/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5087/// copied to this memory location. The address of this memory location does
5088/// not have to be aligned.
5089/// \param __addr_lo
5090/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5091/// copied to this memory location. The address of this memory location does
5092/// not have to be aligned.
5093/// \param __a
5094/// A 256-bit floating-point vector of [4 x double].
5095static __inline void __DEFAULT_FN_ATTRS
5096_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5097{
5098 __m128d __v128;
5099
5100 __v128 = _mm256_castpd256_pd128(__a);
5101 _mm_storeu_pd(__addr_lo, __v128);
5102 __v128 = _mm256_extractf128_pd(__a, 1);
5103 _mm_storeu_pd(__addr_hi, __v128);
5104}
5105
5106/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5107/// two different unaligned memory locations.
5108///
5109/// \headerfile <x86intrin.h>
5110///
5111/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5112/// store instructions.
5113///
5114/// \param __addr_hi
5115/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5116/// copied to this memory location. The address of this memory location does
5117/// not have to be aligned.
5118/// \param __addr_lo
5119/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5120/// copied to this memory location. The address of this memory location does
5121/// not have to be aligned.
5122/// \param __a
5123/// A 256-bit integer vector.
5124static __inline void __DEFAULT_FN_ATTRS
5125_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5126{
5127 __m128i __v128;
5128
5129 __v128 = _mm256_castsi256_si128(__a);
5130 _mm_storeu_si128(__addr_lo, __v128);
5131 __v128 = _mm256_extractf128_si256(__a, 1);
5132 _mm_storeu_si128(__addr_hi, __v128);
5133}
5134
5135#undef __DEFAULT_FN_ATTRS
5136#undef __DEFAULT_FN_ATTRS_CONSTEXPR
5137#undef __DEFAULT_FN_ATTRS128
5138#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5139
5140#endif /* __AVXINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3075
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition: avxintrin.h:4855
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition: avxintrin.h:3119
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:761
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition: avxintrin.h:3325
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2947
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition: avxintrin.h:109
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition: avxintrin.h:3621
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition: avxintrin.h:4451
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition: avxintrin.h:3139
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: avxintrin.h:4181
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition: avxintrin.h:2301
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition: avxintrin.h:3289
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4638
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition: avxintrin.h:3345
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition: avxintrin.h:2488
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition: avxintrin.h:4991
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:373
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3435
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition: avxintrin.h:4434
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:673
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition: avxintrin.h:2209
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:407
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition: avxintrin.h:2421
static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-p...
Definition: avxintrin.h:4485
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition: avxintrin.h:2993
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition: avxintrin.h:2225
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition: avxintrin.h:3670
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2895
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
Definition: avxintrin.h:4502
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition: avxintrin.h:4016
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3460
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3411
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set...
Definition: avxintrin.h:4307
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition: avxintrin.h:999
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition: avxintrin.h:2260
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition: avxintrin.h:3383
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition: avxintrin.h:4788
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition: avxintrin.h:4836
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition: avxintrin.h:3232
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2974
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition: avxintrin.h:1423
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition: avxintrin.h:4876
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition: avxintrin.h:3710
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition: avxintrin.h:4383
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition: avxintrin.h:3601
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition: avxintrin.h:908
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition: avxintrin.h:3657
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:390
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: avxintrin.h:4096
#define __DEFAULT_FN_ATTRS
Definition: avxintrin.h:61
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition: avxintrin.h:3533
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4539
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition: avxintrin.h:4233
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition: avxintrin.h:2281
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition: avxintrin.h:3683
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition: avxintrin.h:2244
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition: avxintrin.h:4252
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements se...
Definition: avxintrin.h:4328
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4468
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2631
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition: avxintrin.h:303
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2865
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition: avxintrin.h:3175
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:691
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition: avxintrin.h:4812
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers re...
Definition: avxintrin.h:2321
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 x float].
Definition: avxintrin.h:4561
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4963
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition: avxintrin.h:145
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition: avxintrin.h:3947
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and broadcasts it to the elements of a [8 x float] vector.
Definition: avxintrin.h:3097
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition: avxintrin.h:261
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zero.
Definition: avxintrin.h:4354
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2807
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and constructs a 256-bit floating-point vector of [4 x double] by concatenating the two 128-bit vectors.
Definition: avxintrin.h:5018
static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition: avxintrin.h:2370
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition: avxintrin.h:2396
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition: avxintrin.h:201
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2719
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two different unaligned memory locations.
Definition: avxintrin.h:5096
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition: avxintrin.h:356
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:164
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:738
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:183
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2748
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition: avxintrin.h:321
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:577
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition: avxintrin.h:3557
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition: avxintrin.h:3509
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:655
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p into a vector of [4 x double].
Definition: avxintrin.h:3192
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-preci...
Definition: avxintrin.h:3749
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2921
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2 x double].
Definition: avxintrin.h:4602
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: avxintrin.h:4214
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4417
static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition: avxintrin.h:2337
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:715
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:784
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2777
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition: avxintrin.h:2194
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-point vectors of [2 x double].
Definition: avxintrin.h:4942
#define __DEFAULT_FN_ATTRS128
Definition: avxintrin.h:64
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition: avxintrin.h:598
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: avxintrin.h:4048
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition: avxintrin.h:1451
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition: avxintrin.h:2466
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory locations.
Definition: avxintrin.h:5125
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition: avxintrin.h:127
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4896
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:637
#define __DEFAULT_FN_ATTRS_CONSTEXPR
Definition: avxintrin.h:73
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2572
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition: avxintrin.h:2443
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition: avxintrin.h:3829
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer vector by concatenating the two 128-bit vectors.
Definition: avxintrin.h:5042
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition: avxintrin.h:814
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition: avxintrin.h:4342
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition: avxintrin.h:4400
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3484
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition: avxintrin.h:219
static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition: avxintrin.h:2353
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location.
Definition: avxintrin.h:3642
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point vectors of [4 x float].
Definition: avxintrin.h:4919
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:3248
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3366
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition: avxintrin.h:282
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition: avxintrin.h:4518
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition: avxintrin.h:3781
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition: avxintrin.h:4366
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4582
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2515
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition: avxintrin.h:3912
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p into a vector of [8 x float].
Definition: avxintrin.h:3212
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and broadcasts it to the elements of a [4 x float] vector.
Definition: avxintrin.h:3053
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2689
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition: avxintrin.h:3011
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition: avxintrin.h:339
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2542
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition: avxintrin.h:853
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition: avxintrin.h:240
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2660
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition: avxintrin.h:4289
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:3269
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:559
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3307
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the spe...
Definition: avxintrin.h:3976
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two differ...
Definition: avxintrin.h:5067
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition: avxintrin.h:4271
typedef double __v4df __attribute__((__vector_size__(32)))
Definition: avxintrin.h:17
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2601
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of the two source vectors.
Definition: avxintrin.h:2836
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 x float].
Definition: avxintrin.h:4620
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition: avxintrin.h:91
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition: avxintrin.h:3581
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [4 x double].
Definition: avxintrin.h:3159
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition: avxintrin.h:619
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1628
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3880
static __inline__ void int __a
Definition: emmintrin.h:4079
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1876
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit integer vector.
Definition: emmintrin.h:3458
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1989
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3911
struct __storeu_i16 *__P __v
Definition: immintrin.h:472
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:2108
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:2029
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1871