clang 20.0.0git
avxintrin.h
Go to the documentation of this file.
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* Internal 32-byte vector views used for element-wise arithmetic and casts. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));
23
/* Unsigned 32-byte vector views. */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));
29
/* Plain char has implementation-defined signedness, so an explicitly signed
 * 32 x 8-bit view is needed. It is internal and should not appear in the
 * interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));
33
/* 32-byte vector types declared with 32-byte alignment. */
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i
    __attribute__((__vector_size__(32), __aligned__(32)));
37
/* Same 32-byte vector types, but declared with byte alignment. */
typedef float __m256_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u
    __attribute__((__vector_size__(32), __aligned__(1)));
41
42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Define the default attributes for the functions in this file.
 *
 * The "avx,no-evex512" target string is used only when __EVEX512__ is defined
 * and __AVX10_1_512__ is not; every other configuration targets plain "avx".
 */
#if !defined(__EVEX512__) || defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx,no-evex512"), __min_vector_width__(128)))
#endif
68
69/* Arithmetic */
70/// Adds two 256-bit vectors of [4 x double].
71///
72/// \headerfile <x86intrin.h>
73///
74/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
75///
76/// \param __a
77/// A 256-bit vector of [4 x double] containing one of the source operands.
78/// \param __b
79/// A 256-bit vector of [4 x double] containing one of the source operands.
80/// \returns A 256-bit vector of [4 x double] containing the sums of both
81/// operands.
82static __inline __m256d __DEFAULT_FN_ATTRS
83_mm256_add_pd(__m256d __a, __m256d __b)
84{
85 return (__m256d)((__v4df)__a+(__v4df)__b);
86}
87
88/// Adds two 256-bit vectors of [8 x float].
89///
90/// \headerfile <x86intrin.h>
91///
92/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
93///
94/// \param __a
95/// A 256-bit vector of [8 x float] containing one of the source operands.
96/// \param __b
97/// A 256-bit vector of [8 x float] containing one of the source operands.
98/// \returns A 256-bit vector of [8 x float] containing the sums of both
99/// operands.
100static __inline __m256 __DEFAULT_FN_ATTRS
101_mm256_add_ps(__m256 __a, __m256 __b)
102{
103 return (__m256)((__v8sf)__a+(__v8sf)__b);
104}
105
106/// Subtracts two 256-bit vectors of [4 x double].
107///
108/// \headerfile <x86intrin.h>
109///
110/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
111///
112/// \param __a
113/// A 256-bit vector of [4 x double] containing the minuend.
114/// \param __b
115/// A 256-bit vector of [4 x double] containing the subtrahend.
116/// \returns A 256-bit vector of [4 x double] containing the differences between
117/// both operands.
118static __inline __m256d __DEFAULT_FN_ATTRS
119_mm256_sub_pd(__m256d __a, __m256d __b)
120{
121 return (__m256d)((__v4df)__a-(__v4df)__b);
122}
123
124/// Subtracts two 256-bit vectors of [8 x float].
125///
126/// \headerfile <x86intrin.h>
127///
128/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
129///
130/// \param __a
131/// A 256-bit vector of [8 x float] containing the minuend.
132/// \param __b
133/// A 256-bit vector of [8 x float] containing the subtrahend.
134/// \returns A 256-bit vector of [8 x float] containing the differences between
135/// both operands.
136static __inline __m256 __DEFAULT_FN_ATTRS
137_mm256_sub_ps(__m256 __a, __m256 __b)
138{
139 return (__m256)((__v8sf)__a-(__v8sf)__b);
140}
141
142/// Adds the even-indexed values and subtracts the odd-indexed values of
143/// two 256-bit vectors of [4 x double].
144///
145/// \headerfile <x86intrin.h>
146///
147/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
148///
149/// \param __a
150/// A 256-bit vector of [4 x double] containing the left source operand.
151/// \param __b
152/// A 256-bit vector of [4 x double] containing the right source operand.
153/// \returns A 256-bit vector of [4 x double] containing the alternating sums
154/// and differences between both operands.
155static __inline __m256d __DEFAULT_FN_ATTRS
156_mm256_addsub_pd(__m256d __a, __m256d __b)
157{
158 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
159}
160
161/// Adds the even-indexed values and subtracts the odd-indexed values of
162/// two 256-bit vectors of [8 x float].
163///
164/// \headerfile <x86intrin.h>
165///
166/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
167///
168/// \param __a
169/// A 256-bit vector of [8 x float] containing the left source operand.
170/// \param __b
171/// A 256-bit vector of [8 x float] containing the right source operand.
172/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
173/// differences between both operands.
174static __inline __m256 __DEFAULT_FN_ATTRS
175_mm256_addsub_ps(__m256 __a, __m256 __b)
176{
177 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
178}
179
180/// Divides two 256-bit vectors of [4 x double].
181///
182/// \headerfile <x86intrin.h>
183///
184/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
185///
186/// \param __a
187/// A 256-bit vector of [4 x double] containing the dividend.
188/// \param __b
189/// A 256-bit vector of [4 x double] containing the divisor.
190/// \returns A 256-bit vector of [4 x double] containing the quotients of both
191/// operands.
192static __inline __m256d __DEFAULT_FN_ATTRS
193_mm256_div_pd(__m256d __a, __m256d __b)
194{
195 return (__m256d)((__v4df)__a/(__v4df)__b);
196}
197
198/// Divides two 256-bit vectors of [8 x float].
199///
200/// \headerfile <x86intrin.h>
201///
202/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
203///
204/// \param __a
205/// A 256-bit vector of [8 x float] containing the dividend.
206/// \param __b
207/// A 256-bit vector of [8 x float] containing the divisor.
208/// \returns A 256-bit vector of [8 x float] containing the quotients of both
209/// operands.
210static __inline __m256 __DEFAULT_FN_ATTRS
211_mm256_div_ps(__m256 __a, __m256 __b)
212{
213 return (__m256)((__v8sf)__a/(__v8sf)__b);
214}
215
216/// Compares two 256-bit vectors of [4 x double] and returns the greater
217/// of each pair of values.
218///
219/// If either value in a comparison is NaN, returns the value from \a __b.
220///
221/// \headerfile <x86intrin.h>
222///
223/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
224///
225/// \param __a
226/// A 256-bit vector of [4 x double] containing one of the operands.
227/// \param __b
228/// A 256-bit vector of [4 x double] containing one of the operands.
229/// \returns A 256-bit vector of [4 x double] containing the maximum values
230/// between both operands.
231static __inline __m256d __DEFAULT_FN_ATTRS
232_mm256_max_pd(__m256d __a, __m256d __b)
233{
234 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
235}
236
237/// Compares two 256-bit vectors of [8 x float] and returns the greater
238/// of each pair of values.
239///
240/// If either value in a comparison is NaN, returns the value from \a __b.
241///
242/// \headerfile <x86intrin.h>
243///
244/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
245///
246/// \param __a
247/// A 256-bit vector of [8 x float] containing one of the operands.
248/// \param __b
249/// A 256-bit vector of [8 x float] containing one of the operands.
250/// \returns A 256-bit vector of [8 x float] containing the maximum values
251/// between both operands.
252static __inline __m256 __DEFAULT_FN_ATTRS
253_mm256_max_ps(__m256 __a, __m256 __b)
254{
255 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
256}
257
258/// Compares two 256-bit vectors of [4 x double] and returns the lesser
259/// of each pair of values.
260///
261/// If either value in a comparison is NaN, returns the value from \a __b.
262///
263/// \headerfile <x86intrin.h>
264///
265/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
266///
267/// \param __a
268/// A 256-bit vector of [4 x double] containing one of the operands.
269/// \param __b
270/// A 256-bit vector of [4 x double] containing one of the operands.
271/// \returns A 256-bit vector of [4 x double] containing the minimum values
272/// between both operands.
273static __inline __m256d __DEFAULT_FN_ATTRS
274_mm256_min_pd(__m256d __a, __m256d __b)
275{
276 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
277}
278
279/// Compares two 256-bit vectors of [8 x float] and returns the lesser
280/// of each pair of values.
281///
282/// If either value in a comparison is NaN, returns the value from \a __b.
283///
284/// \headerfile <x86intrin.h>
285///
286/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
287///
288/// \param __a
289/// A 256-bit vector of [8 x float] containing one of the operands.
290/// \param __b
291/// A 256-bit vector of [8 x float] containing one of the operands.
292/// \returns A 256-bit vector of [8 x float] containing the minimum values
293/// between both operands.
294static __inline __m256 __DEFAULT_FN_ATTRS
295_mm256_min_ps(__m256 __a, __m256 __b)
296{
297 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
298}
299
300/// Multiplies two 256-bit vectors of [4 x double].
301///
302/// \headerfile <x86intrin.h>
303///
304/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
305///
306/// \param __a
307/// A 256-bit vector of [4 x double] containing one of the operands.
308/// \param __b
309/// A 256-bit vector of [4 x double] containing one of the operands.
310/// \returns A 256-bit vector of [4 x double] containing the products of both
311/// operands.
312static __inline __m256d __DEFAULT_FN_ATTRS
313_mm256_mul_pd(__m256d __a, __m256d __b)
314{
315 return (__m256d)((__v4df)__a * (__v4df)__b);
316}
317
318/// Multiplies two 256-bit vectors of [8 x float].
319///
320/// \headerfile <x86intrin.h>
321///
322/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
323///
324/// \param __a
325/// A 256-bit vector of [8 x float] containing one of the operands.
326/// \param __b
327/// A 256-bit vector of [8 x float] containing one of the operands.
328/// \returns A 256-bit vector of [8 x float] containing the products of both
329/// operands.
330static __inline __m256 __DEFAULT_FN_ATTRS
331_mm256_mul_ps(__m256 __a, __m256 __b)
332{
333 return (__m256)((__v8sf)__a * (__v8sf)__b);
334}
335
336/// Calculates the square roots of the values in a 256-bit vector of
337/// [4 x double].
338///
339/// \headerfile <x86intrin.h>
340///
341/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
342///
343/// \param __a
344/// A 256-bit vector of [4 x double].
345/// \returns A 256-bit vector of [4 x double] containing the square roots of the
346/// values in the operand.
347static __inline __m256d __DEFAULT_FN_ATTRS
349{
350 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
351}
352
353/// Calculates the square roots of the values in a 256-bit vector of
354/// [8 x float].
355///
356/// \headerfile <x86intrin.h>
357///
358/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
359///
360/// \param __a
361/// A 256-bit vector of [8 x float].
362/// \returns A 256-bit vector of [8 x float] containing the square roots of the
363/// values in the operand.
364static __inline __m256 __DEFAULT_FN_ATTRS
366{
367 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
368}
369
370/// Calculates the reciprocal square roots of the values in a 256-bit
371/// vector of [8 x float].
372///
373/// \headerfile <x86intrin.h>
374///
375/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
376///
377/// \param __a
378/// A 256-bit vector of [8 x float].
379/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
380/// roots of the values in the operand.
381static __inline __m256 __DEFAULT_FN_ATTRS
383{
384 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
385}
386
387/// Calculates the reciprocals of the values in a 256-bit vector of
388/// [8 x float].
389///
390/// \headerfile <x86intrin.h>
391///
392/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
393///
394/// \param __a
395/// A 256-bit vector of [8 x float].
396/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
397/// values in the operand.
398static __inline __m256 __DEFAULT_FN_ATTRS
400{
401 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
402}
403
404/// Rounds the values in a 256-bit vector of [4 x double] as specified
405/// by the byte operand. The source values are rounded to integer values and
406/// returned as 64-bit double-precision floating-point values.
407///
408/// \headerfile <x86intrin.h>
409///
410/// \code
411/// __m256d _mm256_round_pd(__m256d V, const int M);
412/// \endcode
413///
414/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
415///
416/// \param V
417/// A 256-bit vector of [4 x double].
418/// \param M
419/// An integer value that specifies the rounding operation. \n
420/// Bits [7:4] are reserved. \n
421/// Bit [3] is a precision exception value: \n
422/// 0: A normal PE exception is used. \n
423/// 1: The PE field is not updated. \n
424/// Bit [2] is the rounding control source: \n
425/// 0: Use bits [1:0] of \a M. \n
426/// 1: Use the current MXCSR setting. \n
427/// Bits [1:0] contain the rounding control definition: \n
428/// 00: Nearest. \n
429/// 01: Downward (toward negative infinity). \n
430/// 10: Upward (toward positive infinity). \n
431/// 11: Truncated.
432/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M)                                                  \
  ((__m256d)__builtin_ia32_roundpd256(                                         \
      (__v4df)(__m256d)(V), (M)))
435
436/// Rounds the values stored in a 256-bit vector of [8 x float] as
437/// specified by the byte operand. The source values are rounded to integer
438/// values and returned as floating-point values.
439///
440/// \headerfile <x86intrin.h>
441///
442/// \code
443/// __m256 _mm256_round_ps(__m256 V, const int M);
444/// \endcode
445///
446/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
447///
448/// \param V
449/// A 256-bit vector of [8 x float].
450/// \param M
451/// An integer value that specifies the rounding operation. \n
452/// Bits [7:4] are reserved. \n
453/// Bit [3] is a precision exception value: \n
454/// 0: A normal PE exception is used. \n
455/// 1: The PE field is not updated. \n
456/// Bit [2] is the rounding control source: \n
457/// 0: Use bits [1:0] of \a M. \n
458/// 1: Use the current MXCSR setting. \n
459/// Bits [1:0] contain the rounding control definition: \n
460/// 00: Nearest. \n
461/// 01: Downward (toward negative infinity). \n
462/// 10: Upward (toward positive infinity). \n
463/// 11: Truncated.
464/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M)                                                  \
  ((__m256)__builtin_ia32_roundps256(                                          \
      (__v8sf)(__m256)(V), (M)))
467
468/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
469/// source values are rounded up to integer values and returned as 64-bit
470/// double-precision floating-point values.
471///
472/// \headerfile <x86intrin.h>
473///
474/// \code
475/// __m256d _mm256_ceil_pd(__m256d V);
476/// \endcode
477///
478/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
479///
480/// \param V
481/// A 256-bit vector of [4 x double].
482/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)                                                      \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
484
485/// Rounds down the values stored in a 256-bit vector of [4 x double].
486/// The source values are rounded down to integer values and returned as
487/// 64-bit double-precision floating-point values.
488///
489/// \headerfile <x86intrin.h>
490///
491/// \code
492/// __m256d _mm256_floor_pd(__m256d V);
493/// \endcode
494///
495/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
496///
497/// \param V
498/// A 256-bit vector of [4 x double].
499/// \returns A 256-bit vector of [4 x double] containing the rounded down
500/// values.
#define _mm256_floor_pd(V)                                                     \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
502
503/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
504/// source values are rounded up to integer values and returned as
505/// floating-point values.
506///
507/// \headerfile <x86intrin.h>
508///
509/// \code
510/// __m256 _mm256_ceil_ps(__m256 V);
511/// \endcode
512///
513/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
514///
515/// \param V
516/// A 256-bit vector of [8 x float].
517/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)                                                      \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
519
520/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
521/// source values are rounded down to integer values and returned as
522/// floating-point values.
523///
524/// \headerfile <x86intrin.h>
525///
526/// \code
527/// __m256 _mm256_floor_ps(__m256 V);
528/// \endcode
529///
530/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
531///
532/// \param V
533/// A 256-bit vector of [8 x float].
534/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V)                                                     \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
536
537/* Logical */
538/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
539///
540/// \headerfile <x86intrin.h>
541///
542/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
543///
544/// \param __a
545/// A 256-bit vector of [4 x double] containing one of the source operands.
546/// \param __b
547/// A 256-bit vector of [4 x double] containing one of the source operands.
548/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
549/// values between both operands.
550static __inline __m256d __DEFAULT_FN_ATTRS
551_mm256_and_pd(__m256d __a, __m256d __b)
552{
553 return (__m256d)((__v4du)__a & (__v4du)__b);
554}
555
556/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
557///
558/// \headerfile <x86intrin.h>
559///
560/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
561///
562/// \param __a
563/// A 256-bit vector of [8 x float] containing one of the source operands.
564/// \param __b
565/// A 256-bit vector of [8 x float] containing one of the source operands.
566/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
567/// values between both operands.
568static __inline __m256 __DEFAULT_FN_ATTRS
569_mm256_and_ps(__m256 __a, __m256 __b)
570{
571 return (__m256)((__v8su)__a & (__v8su)__b);
572}
573
574/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
575/// the one's complement of the values contained in the first source operand.
576///
577/// \headerfile <x86intrin.h>
578///
579/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
580///
581/// \param __a
582/// A 256-bit vector of [4 x double] containing the left source operand. The
583/// one's complement of this value is used in the bitwise AND.
584/// \param __b
585/// A 256-bit vector of [4 x double] containing the right source operand.
586/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
587/// values of the second operand and the one's complement of the first
588/// operand.
589static __inline __m256d __DEFAULT_FN_ATTRS
590_mm256_andnot_pd(__m256d __a, __m256d __b)
591{
592 return (__m256d)(~(__v4du)__a & (__v4du)__b);
593}
594
595/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
596/// the one's complement of the values contained in the first source operand.
597///
598/// \headerfile <x86intrin.h>
599///
600/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
601///
602/// \param __a
603/// A 256-bit vector of [8 x float] containing the left source operand. The
604/// one's complement of this value is used in the bitwise AND.
605/// \param __b
606/// A 256-bit vector of [8 x float] containing the right source operand.
607/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
608/// values of the second operand and the one's complement of the first
609/// operand.
610static __inline __m256 __DEFAULT_FN_ATTRS
611_mm256_andnot_ps(__m256 __a, __m256 __b)
612{
613 return (__m256)(~(__v8su)__a & (__v8su)__b);
614}
615
616/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
617///
618/// \headerfile <x86intrin.h>
619///
620/// This intrinsic corresponds to the <c> VORPD </c> instruction.
621///
622/// \param __a
623/// A 256-bit vector of [4 x double] containing one of the source operands.
624/// \param __b
625/// A 256-bit vector of [4 x double] containing one of the source operands.
626/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
627/// values between both operands.
628static __inline __m256d __DEFAULT_FN_ATTRS
629_mm256_or_pd(__m256d __a, __m256d __b)
630{
631 return (__m256d)((__v4du)__a | (__v4du)__b);
632}
633
634/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
635///
636/// \headerfile <x86intrin.h>
637///
638/// This intrinsic corresponds to the <c> VORPS </c> instruction.
639///
640/// \param __a
641/// A 256-bit vector of [8 x float] containing one of the source operands.
642/// \param __b
643/// A 256-bit vector of [8 x float] containing one of the source operands.
644/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
645/// values between both operands.
646static __inline __m256 __DEFAULT_FN_ATTRS
647_mm256_or_ps(__m256 __a, __m256 __b)
648{
649 return (__m256)((__v8su)__a | (__v8su)__b);
650}
651
652/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
653///
654/// \headerfile <x86intrin.h>
655///
656/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
657///
658/// \param __a
659/// A 256-bit vector of [4 x double] containing one of the source operands.
660/// \param __b
661/// A 256-bit vector of [4 x double] containing one of the source operands.
662/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
663/// values between both operands.
664static __inline __m256d __DEFAULT_FN_ATTRS
665_mm256_xor_pd(__m256d __a, __m256d __b)
666{
667 return (__m256d)((__v4du)__a ^ (__v4du)__b);
668}
669
670/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
671///
672/// \headerfile <x86intrin.h>
673///
674/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
675///
676/// \param __a
677/// A 256-bit vector of [8 x float] containing one of the source operands.
678/// \param __b
679/// A 256-bit vector of [8 x float] containing one of the source operands.
680/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
681/// values between both operands.
682static __inline __m256 __DEFAULT_FN_ATTRS
683_mm256_xor_ps(__m256 __a, __m256 __b)
684{
685 return (__m256)((__v8su)__a ^ (__v8su)__b);
686}
687
688/* Horizontal arithmetic */
689/// Horizontally adds the adjacent pairs of values contained in two
690/// 256-bit vectors of [4 x double].
691///
692/// \headerfile <x86intrin.h>
693///
694/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
695///
696/// \param __a
697/// A 256-bit vector of [4 x double] containing one of the source operands.
698/// The horizontal sums of the values are returned in the even-indexed
699/// elements of a vector of [4 x double].
700/// \param __b
701/// A 256-bit vector of [4 x double] containing one of the source operands.
702/// The horizontal sums of the values are returned in the odd-indexed
703/// elements of a vector of [4 x double].
704/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
705/// both operands.
706static __inline __m256d __DEFAULT_FN_ATTRS
707_mm256_hadd_pd(__m256d __a, __m256d __b)
708{
709 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
710}
711
712/// Horizontally adds the adjacent pairs of values contained in two
713/// 256-bit vectors of [8 x float].
714///
715/// \headerfile <x86intrin.h>
716///
717/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
718///
719/// \param __a
720/// A 256-bit vector of [8 x float] containing one of the source operands.
721/// The horizontal sums of the values are returned in the elements with
722/// index 0, 1, 4, 5 of a vector of [8 x float].
723/// \param __b
724/// A 256-bit vector of [8 x float] containing one of the source operands.
725/// The horizontal sums of the values are returned in the elements with
726/// index 2, 3, 6, 7 of a vector of [8 x float].
727/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
728/// both operands.
729static __inline __m256 __DEFAULT_FN_ATTRS
730_mm256_hadd_ps(__m256 __a, __m256 __b)
731{
732 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
733}
734
735/// Horizontally subtracts the adjacent pairs of values contained in two
736/// 256-bit vectors of [4 x double].
737///
738/// \headerfile <x86intrin.h>
739///
740/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
741///
742/// \param __a
743/// A 256-bit vector of [4 x double] containing one of the source operands.
744/// The horizontal differences between the values are returned in the
745/// even-indexed elements of a vector of [4 x double].
746/// \param __b
747/// A 256-bit vector of [4 x double] containing one of the source operands.
748/// The horizontal differences between the values are returned in the
749/// odd-indexed elements of a vector of [4 x double].
750/// \returns A 256-bit vector of [4 x double] containing the horizontal
751/// differences of both operands.
752static __inline __m256d __DEFAULT_FN_ATTRS
753_mm256_hsub_pd(__m256d __a, __m256d __b)
754{
755 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
756}
757
758/// Horizontally subtracts the adjacent pairs of values contained in two
759/// 256-bit vectors of [8 x float].
760///
761/// \headerfile <x86intrin.h>
762///
763/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
764///
765/// \param __a
766/// A 256-bit vector of [8 x float] containing one of the source operands.
767/// The horizontal differences between the values are returned in the
768/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
769/// \param __b
770/// A 256-bit vector of [8 x float] containing one of the source operands.
771/// The horizontal differences between the values are returned in the
772/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
773/// \returns A 256-bit vector of [8 x float] containing the horizontal
774/// differences of both operands.
775static __inline __m256 __DEFAULT_FN_ATTRS
776_mm256_hsub_ps(__m256 __a, __m256 __b)
777{
778 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
779}
780
781/* Vector permutations */
782/// Copies the values in a 128-bit vector of [2 x double] as specified
783/// by the 128-bit integer vector operand.
784///
785/// \headerfile <x86intrin.h>
786///
787/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
788///
789/// \param __a
790/// A 128-bit vector of [2 x double].
791/// \param __c
792/// A 128-bit integer vector operand specifying how the values are to be
793/// copied. \n
794/// Bit [1]: \n
795/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
796/// vector. \n
797/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
798/// returned vector. \n
799/// Bit [65]: \n
800/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
801/// returned vector. \n
802/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
803/// returned vector.
804/// \returns A 128-bit vector of [2 x double] containing the copied values.
805static __inline __m128d __DEFAULT_FN_ATTRS128
806_mm_permutevar_pd(__m128d __a, __m128i __c)
807{
808 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
809}
810
811/// Copies the values in a 256-bit vector of [4 x double] as specified
812/// by the 256-bit integer vector operand.
813///
814/// \headerfile <x86intrin.h>
815///
816/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
817///
818/// \param __a
819/// A 256-bit vector of [4 x double].
820/// \param __c
821/// A 256-bit integer vector operand specifying how the values are to be
822/// copied. \n
823/// Bit [1]: \n
824/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
825/// vector. \n
826/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
827/// returned vector. \n
828/// Bit [65]: \n
829/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
830/// returned vector. \n
831/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
832/// returned vector. \n
833/// Bit [129]: \n
834/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
835/// returned vector. \n
836/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
837/// returned vector. \n
838/// Bit [193]: \n
839/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
840/// returned vector. \n
841/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
842/// returned vector.
843/// \returns A 256-bit vector of [4 x double] containing the copied values.
844static __inline __m256d __DEFAULT_FN_ATTRS
845_mm256_permutevar_pd(__m256d __a, __m256i __c)
846{
847 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
848}
849
850/// Copies the values stored in a 128-bit vector of [4 x float] as
851/// specified by the 128-bit integer vector operand.
852///
853/// \headerfile <x86intrin.h>
854///
855/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
856///
857/// \param __a
858/// A 128-bit vector of [4 x float].
859/// \param __c
860/// A 128-bit integer vector operand specifying how the values are to be
861/// copied. \n
862/// Bits [1:0]: \n
863/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
864/// returned vector. \n
865/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
866/// returned vector. \n
867/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
868/// returned vector. \n
869/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
870/// returned vector. \n
871/// Bits [33:32]: \n
872/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
873/// returned vector. \n
874/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
875/// returned vector. \n
876/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
877/// returned vector. \n
878/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
879/// returned vector. \n
880/// Bits [65:64]: \n
881/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
882/// returned vector. \n
883/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
884/// returned vector. \n
885/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
886/// returned vector. \n
887/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
888/// returned vector. \n
889/// Bits [97:96]: \n
890/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
891/// returned vector. \n
892/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
893/// returned vector. \n
894/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
895/// returned vector. \n
896/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
897/// returned vector.
898/// \returns A 128-bit vector of [4 x float] containing the copied values.
899static __inline __m128 __DEFAULT_FN_ATTRS128
900_mm_permutevar_ps(__m128 __a, __m128i __c)
901{
902 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
903}
904
905/// Copies the values stored in a 256-bit vector of [8 x float] as
906/// specified by the 256-bit integer vector operand.
907///
908/// \headerfile <x86intrin.h>
909///
910/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
911///
912/// \param __a
913/// A 256-bit vector of [8 x float].
914/// \param __c
915/// A 256-bit integer vector operand specifying how the values are to be
916/// copied. \n
917/// Bits [1:0]: \n
918/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
919/// returned vector. \n
920/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
921/// returned vector. \n
922/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
923/// returned vector. \n
924/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
925/// returned vector. \n
926/// Bits [33:32]: \n
927/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
928/// returned vector. \n
929/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
930/// returned vector. \n
931/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
932/// returned vector. \n
933/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
934/// returned vector. \n
935/// Bits [65:64]: \n
936/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
937/// returned vector. \n
938/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
939/// returned vector. \n
940/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
941/// returned vector. \n
942/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
943/// returned vector. \n
944/// Bits [97:96]: \n
945/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
946/// returned vector. \n
947/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
948/// returned vector. \n
949/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
950/// returned vector. \n
951/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
952/// returned vector. \n
953/// Bits [129:128]: \n
954/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
955/// returned vector. \n
956/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
957/// returned vector. \n
958/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
959/// returned vector. \n
960/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
961/// returned vector. \n
962/// Bits [161:160]: \n
963/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
964/// returned vector. \n
965/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
966/// returned vector. \n
967/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
968/// returned vector. \n
969/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
970/// returned vector. \n
971/// Bits [193:192]: \n
972/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
973/// returned vector. \n
974/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
975/// returned vector. \n
976/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
977/// returned vector. \n
978/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
979/// returned vector. \n
980/// Bits [225:224]: \n
981/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
982/// returned vector. \n
983/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
984/// returned vector. \n
985/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
986/// returned vector. \n
987/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
988/// returned vector.
989/// \returns A 256-bit vector of [8 x float] containing the copied values.
990static __inline __m256 __DEFAULT_FN_ATTRS
992{
993 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
994}
995
/// Permutes the double-precision elements of a 128-bit vector of
///    [2 x double] under control of an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double] providing the source values.
/// \param C
///    An immediate integer operand. One control bit per result element
///    chooses which source element is copied: \n
///    Bit [0] selects the value of result bits [63:0]: \n
///    0: Bits [63:0] of the source. \n
///    1: Bits [127:64] of the source. \n
///    Bit [1] selects the value of result bits [127:64]: \n
///    0: Bits [63:0] of the source. \n
///    1: Bits [127:64] of the source.
/// \returns A 128-bit vector of [2 x double] holding the selected values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd( \
      (__v2df)(__m128d)(A), \
      (int)(C)))
1025
/// Permutes the double-precision elements of a 256-bit vector of
///    [4 x double] independently within each 128-bit half, under control of
///    an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double] providing the source values.
/// \param C
///    An immediate integer operand. One control bit per result element
///    chooses which source element is copied: \n
///    Bit [0] selects the value of result bits [63:0]: \n
///    0: Bits [63:0] of the source. \n
///    1: Bits [127:64] of the source. \n
///    Bit [1] selects the value of result bits [127:64]: \n
///    0: Bits [63:0] of the source. \n
///    1: Bits [127:64] of the source. \n
///    Bit [2] selects the value of result bits [191:128]: \n
///    0: Bits [191:128] of the source. \n
///    1: Bits [255:192] of the source. \n
///    Bit [3] selects the value of result bits [255:192]: \n
///    0: Bits [191:128] of the source. \n
///    1: Bits [255:192] of the source.
/// \returns A 256-bit vector of [4 x double] holding the selected values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256( \
      (__v4df)(__m256d)(A), \
      (int)(C)))
1065
/// Permutes the single-precision elements of a 128-bit vector of
///    [4 x float] under control of an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float] providing the source values.
/// \param C
///    An immediate integer operand. Four 2-bit fields each pick the source
///    element that is copied to one element of the result: \n
///    Bits [1:0] select the value of result bits [31:0]. \n
///    Bits [3:2] select the value of result bits [63:32]. \n
///    Bits [5:4] select the value of result bits [95:64]. \n
///    Bits [7:6] select the value of result bits [127:96]. \n
///    Every field uses the same encoding: \n
///    00: Bits [31:0] of the source are copied. \n
///    01: Bits [63:32] of the source are copied. \n
///    10: Bits [95:64] of the source are copied. \n
///    11: Bits [127:96] of the source are copied.
/// \returns A 128-bit vector of [4 x float] holding the selected values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps( \
      (__v4sf)(__m128)(A), \
      (int)(C)))
1121
/// Permutes the single-precision elements of a 256-bit vector of
///    [8 x float] under control of an immediate integer operand. The same
///    control bits are applied to the lower and to the upper 128-bit half of
///    the vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float] providing the source values.
/// \param C
///    An immediate integer operand made up of four 2-bit fields. Each field
///    picks the source element for one element position in both 128-bit
///    halves of the result: \n
///    Bits [1:0] select the values of result bits [31:0] and [159:128]. \n
///    Bits [3:2] select the values of result bits [63:32] and [191:160]. \n
///    Bits [5:4] select the values of result bits [95:64] and [223:192]. \n
///    Bits [7:6] select the values of result bits [127:96] and [255:224]. \n
///    Every field uses the same encoding: \n
///    00: Bits [31:0] of the source are copied to the lower-half position
///        and bits [159:128] to the upper-half position. \n
///    01: Bits [63:32] of the source are copied to the lower-half position
///        and bits [191:160] to the upper-half position. \n
///    10: Bits [95:64] of the source are copied to the lower-half position
///        and bits [223:192] to the upper-half position. \n
///    11: Bits [127:96] of the source are copied to the lower-half position
///        and bits [255:224] to the upper-half position.
/// \returns A 256-bit vector of [8 x float] holding the selected values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256( \
      (__v8sf)(__m256)(A), \
      (int)(C)))
1213
/// Builds a 256-bit vector of [4 x double] whose two 128-bit halves are each
///    chosen from the four 128-bit halves of two 256-bit source vectors, as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand holding two 2-bit selectors: \n
///    Bits [1:0] select the value of bits [127:0] of the destination. \n
///    Bits [5:4] select the value of bits [255:128] of the destination. \n
///    Both selectors use the same encoding: \n
///    00: Bits [127:0] of operand \a V1 are copied. \n
///    01: Bits [255:128] of operand \a V1 are copied. \n
///    10: Bits [127:0] of operand \a V2 are copied. \n
///    11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit vector of [4 x double] holding the selected values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (int)(M)))
1254
/// Builds a 256-bit vector of [8 x float] whose two 128-bit halves are each
///    chosen from the four 128-bit halves of two 256-bit source vectors, as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand holding two 2-bit selectors: \n
///    Bits [1:0] select the value of bits [127:0] of the destination. \n
///    Bits [5:4] select the value of bits [255:128] of the destination. \n
///    Both selectors use the same encoding: \n
///    00: Bits [127:0] of operand \a V1 are copied. \n
///    01: Bits [255:128] of operand \a V1 are copied. \n
///    10: Bits [127:0] of operand \a V2 are copied. \n
///    11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit vector of [8 x float] holding the selected values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (int)(M)))
1295
/// Builds a 256-bit integer vector whose two 128-bit halves are each chosen
///    from the four 128-bit halves of two 256-bit integer source vectors, as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand holding two 2-bit selectors: \n
///    Bits [1:0] select the value of bits [127:0] of the destination. \n
///    Bits [5:4] select the value of bits [255:128] of the destination. \n
///    Both selectors use the same encoding: \n
///    00: Bits [127:0] of operand \a V1 are copied. \n
///    01: Bits [255:128] of operand \a V1 are copied. \n
///    10: Bits [127:0] of operand \a V2 are copied. \n
///    11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit integer vector holding the selected values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256( \
      (__v8si)(__m256i)(V1), \
      (__v8si)(__m256i)(V2), \
      (int)(M)))
1335
/* Vector Blend */
/// Combines two 256-bit vectors of [4 x double] element by element, taking
///    each 64-bit double-precision value from one vector or the other as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand whose mask bits [3:0] control the merge.
///    The position of a mask bit is the index of the element it controls. \n
///    0: The corresponding 64-bit element of operand \a V1 is copied to the
///       same position in the destination. \n
///    1: The corresponding 64-bit element of operand \a V2 is copied to the
///       same position in the destination.
/// \returns A 256-bit vector of [4 x double] holding the merged values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (int)(M)))
1364
/// Combines two 256-bit vectors of [8 x float] element by element, taking
///    each 32-bit single-precision value from one vector or the other as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand whose mask bits [7:0] control the merge.
///    The position of a mask bit is the index of the element it controls. \n
///    0: The corresponding 32-bit element of operand \a V1 is copied to the
///       same position in the destination. \n
///    1: The corresponding 32-bit element of operand \a V2 is copied to the
///       same position in the destination.
/// \returns A 256-bit vector of [8 x float] holding the merged values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (int)(M)))
1392
1393/// Merges 64-bit double-precision data values stored in either of the
1394/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1395/// operand.
1396///
1397/// \headerfile <x86intrin.h>
1398///
1399/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1400///
1401/// \param __a
1402/// A 256-bit vector of [4 x double].
1403/// \param __b
1404/// A 256-bit vector of [4 x double].
1405/// \param __c
1406/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1407/// how the values are to be copied. The position of the mask bit corresponds
1408/// to the most significant bit of a copied value. When a mask bit is 0, the
1409/// corresponding 64-bit element in operand \a __a is copied to the same
1410/// position in the destination. When a mask bit is 1, the corresponding
1411/// 64-bit element in operand \a __b is copied to the same position in the
1412/// destination.
1413/// \returns A 256-bit vector of [4 x double] containing the copied values.
1414static __inline __m256d __DEFAULT_FN_ATTRS
1415_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1416{
1417 return (__m256d)__builtin_ia32_blendvpd256(
1418 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1419}
1420
1421/// Merges 32-bit single-precision data values stored in either of the
1422/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1423/// operand.
1424///
1425/// \headerfile <x86intrin.h>
1426///
1427/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1428///
1429/// \param __a
1430/// A 256-bit vector of [8 x float].
1431/// \param __b
1432/// A 256-bit vector of [8 x float].
1433/// \param __c
1434/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1435/// and 31 specifying how the values are to be copied. The position of the
1436/// mask bit corresponds to the most significant bit of a copied value. When
1437/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1438/// copied to the same position in the destination. When a mask bit is 1, the
1439/// corresponding 32-bit element in operand \a __b is copied to the same
1440/// position in the destination.
1441/// \returns A 256-bit vector of [8 x float] containing the copied values.
1442static __inline __m256 __DEFAULT_FN_ATTRS
1443_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1444{
1445 return (__m256)__builtin_ia32_blendvps256(
1446 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1447}
1448
/* Vector Dot Product */
/// Performs two dot-product computations in parallel. The lower halves of
///    the two [8 x float] inputs feed one computation and the upper halves
///    feed the other; the two dot products are returned in the lower and
///    upper halves of the [8 x float] result, respectively.
///
///    The immediate integer operand chooses which input elements take part
///    in the dot product and which result elements receive it. In general,
///    for each dot product, the four corresponding elements of the input
///    vectors are multiplied; the first two products are summed, the second
///    two products are summed, and the two sums are then added to form the
///    final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument, applied in the same way to each of the
///    two parallel dot product computations. \n
///    Bits [7:4] determine which elements of the input vectors are used.
///    Bit [4] corresponds to the lowest element and bit [7] to the highest
///    element of each [4 x float] subvector. If a bit is set, the
///    corresponding elements of the two input vectors are used as an input
///    for the dot product; otherwise that input is treated as zero. \n
///    Bits [3:0] determine which elements of the result receive a copy of
///    the final dot product. Bit [0] corresponds to the lowest element and
///    bit [3] to the highest element of each [4 x float] subvector. If a bit
///    is set, the dot product is returned in the corresponding element;
///    otherwise that element is set to zero.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (M)))
1490
/* Vector shuffle */
/// Builds a 256-bit vector of [8 x float] from 8 float values picked out of
///    two 256-bit operands of [8 x float], as specified by the immediate
///    value operand.
///
///    Four elements are selected from each operand and copied to the
///    destination according to the bits of the immediate operand. Elements
///    selected from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination; elements selected from the second
///    256-bit operand are copied to bits [127:64] and bits [255:192] of the
///    destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain,
///    from its highest element to its lowest, the values:
///    b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four elements selected from this
///    operand are copied to bits [63:0] and bits [191:128] of the
///    destination, according to the bits of the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four elements selected from this
///    operand are copied to bits [127:64] and bits [255:192] of the
///    destination, according to the bits of the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value that specifies which
///    elements to copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    Each 2-bit field assigns a value to one element position in both
///    128-bit halves of the destination: \n
///    Bits [1:0] assign bits [31:0] and [159:128] of the destination. \n
///    Bits [3:2] assign bits [63:32] and [191:160] of the destination. \n
///    Bits [5:4] assign bits [95:64] and [223:192] of the destination. \n
///    Bits [7:6] assign bits [127:96] and [255:224] of the destination. \n
///    Every field uses the same encoding: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256( \
      (__v8sf)(__m256)(a), \
      (__v8sf)(__m256)(b), \
      (int)(mask)))
1547
/// Builds a 256-bit vector of [4 x double] from four double-precision
///    values picked out of two 256-bit operands of [4 x double], as
///    specified by the immediate value operand.
///
///    Elements selected from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] of the destination; elements selected from
///    the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] of the destination. For example, if bits [3:0] of the
///    immediate operand contain a value of 0xF, the 256-bit destination
///    vector would contain, from its highest element to its lowest, the
///    values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An 8-bit immediate value. One control bit per destination element
///    specifies which element to copy from \a a or \a b: \n
///    Bit [0] selects the value of bits [63:0] of the destination: \n
///    0: Bits [63:0] are copied from \a a. \n
///    1: Bits [127:64] are copied from \a a. \n
///    Bit [1] selects the value of bits [127:64] of the destination: \n
///    0: Bits [63:0] are copied from \a b. \n
///    1: Bits [127:64] are copied from \a b. \n
///    Bit [2] selects the value of bits [191:128] of the destination: \n
///    0: Bits [191:128] are copied from \a a. \n
///    1: Bits [255:192] are copied from \a a. \n
///    Bit [3] selects the value of bits [255:192] of the destination: \n
///    0: Bits [191:128] are copied from \a b. \n
///    1: Bits [255:192] are copied from \a b.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256( \
      (__v4df)(__m256d)(a), \
      (__v4df)(__m256d)(b), \
      (int)(mask)))
1593
/* Compare */
/* Comparison predicates 0x08-0x1f for the immediate operand of the
 * _mm*_cmp_{pd,ps,sd,ss} intrinsics documented below. Predicates 0x00-0x07
 * are not defined in this part of the file; presumably they are provided by
 * the SSE headers -- confirm before relying on it. */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

/* Below intrinsic defined in emmintrin.h can be used for AVX */
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)

/* Below intrinsic defined in xmmintrin.h can be used for AVX */
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)

/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))

/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))

/* Below intrinsic defined in emmintrin.h can be used for AVX */
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)

/* Below intrinsic defined in xmmintrin.h can be used for AVX */
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)

/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))

/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))

/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif

/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))


/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))

/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif

2175/* Conversion */
2176/// Converts a vector of [4 x i32] into a vector of [4 x double].
2177///
2178/// \headerfile <x86intrin.h>
2179///
2180/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2181///
2182/// \param __a
2183/// A 128-bit integer vector of [4 x i32].
2184/// \returns A 256-bit vector of [4 x double] containing the converted values.
2185static __inline __m256d __DEFAULT_FN_ATTRS
2187{
2188 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2189}
2190
2191/// Converts a vector of [8 x i32] into a vector of [8 x float].
2192///
2193/// \headerfile <x86intrin.h>
2194///
2195/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2196///
2197/// \param __a
2198/// A 256-bit integer vector.
2199/// \returns A 256-bit vector of [8 x float] containing the converted values.
2200static __inline __m256 __DEFAULT_FN_ATTRS
2202{
2203 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2204}
2205
2206/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2207/// [4 x float].
2208///
2209/// \headerfile <x86intrin.h>
2210///
2211/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2212///
2213/// \param __a
2214/// A 256-bit vector of [4 x double].
2215/// \returns A 128-bit vector of [4 x float] containing the converted values.
2216static __inline __m128 __DEFAULT_FN_ATTRS
2218{
2219 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2220}
2221
2222/// Converts a vector of [8 x float] into a vector of [8 x i32].
2223///
2224/// If a converted value does not fit in a 32-bit integer, raises a
2225/// floating-point invalid exception. If the exception is masked, returns
2226/// the most negative integer.
2227///
2228/// \headerfile <x86intrin.h>
2229///
2230/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2231///
2232/// \param __a
2233/// A 256-bit vector of [8 x float].
2234/// \returns A 256-bit integer vector containing the converted values.
2235static __inline __m256i __DEFAULT_FN_ATTRS
2237{
2238 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2239}
2240
2241/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2242/// x double].
2243///
2244/// \headerfile <x86intrin.h>
2245///
2246/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2247///
2248/// \param __a
2249/// A 128-bit vector of [4 x float].
2250/// \returns A 256-bit vector of [4 x double] containing the converted values.
2251static __inline __m256d __DEFAULT_FN_ATTRS
2253{
2254 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2255}
2256
2257/// Converts a 256-bit vector of [4 x double] into four signed truncated
2258/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2259/// [4 x i32].
2260///
2261/// If a converted value does not fit in a 32-bit integer, raises a
2262/// floating-point invalid exception. If the exception is masked, returns
2263/// the most negative integer.
2264///
2265/// \headerfile <x86intrin.h>
2266///
2267/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2268///
2269/// \param __a
2270/// A 256-bit vector of [4 x double].
2271/// \returns A 128-bit integer vector containing the converted values.
2272static __inline __m128i __DEFAULT_FN_ATTRS
2274{
2275 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2276}
2277
2278/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2279/// [4 x i32].
2280///
2281/// If a converted value does not fit in a 32-bit integer, raises a
2282/// floating-point invalid exception. If the exception is masked, returns
2283/// the most negative integer.
2284///
2285/// \headerfile <x86intrin.h>
2286///
2287/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2288///
2289/// \param __a
2290/// A 256-bit vector of [4 x double].
2291/// \returns A 128-bit integer vector containing the converted values.
2292static __inline __m128i __DEFAULT_FN_ATTRS
2294{
2295 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2296}
2297
2298/// Converts a vector of [8 x float] into eight signed truncated (rounded
2299/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2300///
2301/// If a converted value does not fit in a 32-bit integer, raises a
2302/// floating-point invalid exception. If the exception is masked, returns
2303/// the most negative integer.
2304///
2305/// \headerfile <x86intrin.h>
2306///
2307/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2308///
2309/// \param __a
2310/// A 256-bit vector of [8 x float].
2311/// \returns A 256-bit integer vector containing the converted values.
2312static __inline __m256i __DEFAULT_FN_ATTRS
2314{
2315 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2316}
2317
2318/// Returns the first element of the input vector of [4 x double].
2319///
2320/// \headerfile <x86intrin.h>
2321///
2322/// This intrinsic is a utility function and does not correspond to a specific
2323/// instruction.
2324///
2325/// \param __a
2326/// A 256-bit vector of [4 x double].
2327/// \returns A 64 bit double containing the first element of the input vector.
2328static __inline double __DEFAULT_FN_ATTRS
2330{
2331 return __a[0];
2332}
2333
2334/// Returns the first element of the input vector of [8 x i32].
2335///
2336/// \headerfile <x86intrin.h>
2337///
2338/// This intrinsic is a utility function and does not correspond to a specific
2339/// instruction.
2340///
2341/// \param __a
2342/// A 256-bit vector of [8 x i32].
2343/// \returns A 32 bit integer containing the first element of the input vector.
2344static __inline int __DEFAULT_FN_ATTRS
2346{
2347 __v8si __b = (__v8si)__a;
2348 return __b[0];
2349}
2350
2351/// Returns the first element of the input vector of [8 x float].
2352///
2353/// \headerfile <x86intrin.h>
2354///
2355/// This intrinsic is a utility function and does not correspond to a specific
2356/// instruction.
2357///
2358/// \param __a
2359/// A 256-bit vector of [8 x float].
2360/// \returns A 32 bit float containing the first element of the input vector.
2361static __inline float __DEFAULT_FN_ATTRS
2363{
2364 return __a[0];
2365}
2366
/* Vector replicate */
2368/// Moves and duplicates odd-indexed values from a 256-bit vector of
2369/// [8 x float] to float values in a 256-bit vector of [8 x float].
2370///
2371/// \headerfile <x86intrin.h>
2372///
2373/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2374///
2375/// \param __a
2376/// A 256-bit vector of [8 x float]. \n
2377/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2378/// the return value. \n
2379/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2380/// the return value. \n
2381/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2382/// return value. \n
2383/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2384/// return value.
2385/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2386/// values.
2387static __inline __m256 __DEFAULT_FN_ATTRS
2389{
2390 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2391}
2392
2393/// Moves and duplicates even-indexed values from a 256-bit vector of
2394/// [8 x float] to float values in a 256-bit vector of [8 x float].
2395///
2396/// \headerfile <x86intrin.h>
2397///
2398/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2399///
2400/// \param __a
2401/// A 256-bit vector of [8 x float]. \n
2402/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2403/// the return value. \n
2404/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2405/// the return value. \n
2406/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2407/// return value. \n
2408/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2409/// return value.
2410/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2411/// values.
2412static __inline __m256 __DEFAULT_FN_ATTRS
2414{
2415 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2416}
2417
2418/// Moves and duplicates double-precision floating point values from a
2419/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2420/// vector of [4 x double].
2421///
2422/// \headerfile <x86intrin.h>
2423///
2424/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2425///
2426/// \param __a
2427/// A 256-bit vector of [4 x double]. \n
2428/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2429/// return value. \n
2430/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2431/// the return value.
2432/// \returns A 256-bit vector of [4 x double] containing the moved and
2433/// duplicated values.
2434static __inline __m256d __DEFAULT_FN_ATTRS
2436{
2437 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2438}
2439
/* Unpack and Interleave */
2441/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2442/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2443///
2444/// \headerfile <x86intrin.h>
2445///
2446/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2447///
2448/// \param __a
2449/// A 256-bit floating-point vector of [4 x double]. \n
2450/// Bits [127:64] are written to bits [63:0] of the return value. \n
2451/// Bits [255:192] are written to bits [191:128] of the return value. \n
2452/// \param __b
2453/// A 256-bit floating-point vector of [4 x double]. \n
2454/// Bits [127:64] are written to bits [127:64] of the return value. \n
2455/// Bits [255:192] are written to bits [255:192] of the return value. \n
2456/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2457static __inline __m256d __DEFAULT_FN_ATTRS
2458_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2459{
2460 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2461}
2462
2463/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2464/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2465///
2466/// \headerfile <x86intrin.h>
2467///
2468/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2469///
2470/// \param __a
2471/// A 256-bit floating-point vector of [4 x double]. \n
2472/// Bits [63:0] are written to bits [63:0] of the return value. \n
2473/// Bits [191:128] are written to bits [191:128] of the return value.
2474/// \param __b
2475/// A 256-bit floating-point vector of [4 x double]. \n
2476/// Bits [63:0] are written to bits [127:64] of the return value. \n
2477/// Bits [191:128] are written to bits [255:192] of the return value. \n
2478/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2479static __inline __m256d __DEFAULT_FN_ATTRS
2480_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2481{
2482 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2483}
2484
2485/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2486/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2487/// vector of [8 x float].
2488///
2489/// \headerfile <x86intrin.h>
2490///
2491/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2492///
2493/// \param __a
2494/// A 256-bit vector of [8 x float]. \n
2495/// Bits [95:64] are written to bits [31:0] of the return value. \n
2496/// Bits [127:96] are written to bits [95:64] of the return value. \n
2497/// Bits [223:192] are written to bits [159:128] of the return value. \n
2498/// Bits [255:224] are written to bits [223:192] of the return value.
2499/// \param __b
2500/// A 256-bit vector of [8 x float]. \n
2501/// Bits [95:64] are written to bits [63:32] of the return value. \n
2502/// Bits [127:96] are written to bits [127:96] of the return value. \n
2503/// Bits [223:192] are written to bits [191:160] of the return value. \n
2504/// Bits [255:224] are written to bits [255:224] of the return value.
2505/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2506static __inline __m256 __DEFAULT_FN_ATTRS
2508{
2509 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2510}
2511
2512/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2513/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2514/// vector of [8 x float].
2515///
2516/// \headerfile <x86intrin.h>
2517///
2518/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2519///
2520/// \param __a
2521/// A 256-bit vector of [8 x float]. \n
2522/// Bits [31:0] are written to bits [31:0] of the return value. \n
2523/// Bits [63:32] are written to bits [95:64] of the return value. \n
2524/// Bits [159:128] are written to bits [159:128] of the return value. \n
2525/// Bits [191:160] are written to bits [223:192] of the return value.
2526/// \param __b
2527/// A 256-bit vector of [8 x float]. \n
2528/// Bits [31:0] are written to bits [63:32] of the return value. \n
2529/// Bits [63:32] are written to bits [127:96] of the return value. \n
2530/// Bits [159:128] are written to bits [191:160] of the return value. \n
2531/// Bits [191:160] are written to bits [255:224] of the return value.
2532/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2533static __inline __m256 __DEFAULT_FN_ATTRS
2535{
2536 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2537}
2538
/* Bit Test */
2540/// Given two 128-bit floating-point vectors of [2 x double], perform an
2541/// element-by-element comparison of the double-precision element in the
2542/// first source vector and the corresponding element in the second source
2543/// vector.
2544///
2545/// The EFLAGS register is updated as follows: \n
2546/// If there is at least one pair of double-precision elements where the
2547/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2548/// ZF flag is set to 1. \n
2549/// If there is at least one pair of double-precision elements where the
2550/// sign-bit of the first element is 0 and the sign-bit of the second element
2551/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2552/// This intrinsic returns the value of the ZF flag.
2553///
2554/// \headerfile <x86intrin.h>
2555///
2556/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2557///
2558/// \param __a
2559/// A 128-bit vector of [2 x double].
2560/// \param __b
2561/// A 128-bit vector of [2 x double].
2562/// \returns the ZF flag in the EFLAGS register.
2563static __inline int __DEFAULT_FN_ATTRS128
2564_mm_testz_pd(__m128d __a, __m128d __b)
2565{
2566 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2567}
2568
2569/// Given two 128-bit floating-point vectors of [2 x double], perform an
2570/// element-by-element comparison of the double-precision element in the
2571/// first source vector and the corresponding element in the second source
2572/// vector.
2573///
2574/// The EFLAGS register is updated as follows: \n
2575/// If there is at least one pair of double-precision elements where the
2576/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2577/// ZF flag is set to 1. \n
2578/// If there is at least one pair of double-precision elements where the
2579/// sign-bit of the first element is 0 and the sign-bit of the second element
2580/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2581/// This intrinsic returns the value of the CF flag.
2582///
2583/// \headerfile <x86intrin.h>
2584///
2585/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2586///
2587/// \param __a
2588/// A 128-bit vector of [2 x double].
2589/// \param __b
2590/// A 128-bit vector of [2 x double].
2591/// \returns the CF flag in the EFLAGS register.
2592static __inline int __DEFAULT_FN_ATTRS128
2593_mm_testc_pd(__m128d __a, __m128d __b)
2594{
2595 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2596}
2597
2598/// Given two 128-bit floating-point vectors of [2 x double], perform an
2599/// element-by-element comparison of the double-precision element in the
2600/// first source vector and the corresponding element in the second source
2601/// vector.
2602///
2603/// The EFLAGS register is updated as follows: \n
2604/// If there is at least one pair of double-precision elements where the
2605/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2606/// ZF flag is set to 1. \n
2607/// If there is at least one pair of double-precision elements where the
2608/// sign-bit of the first element is 0 and the sign-bit of the second element
2609/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2610/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2611/// otherwise it returns 0.
2612///
2613/// \headerfile <x86intrin.h>
2614///
2615/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2616///
2617/// \param __a
2618/// A 128-bit vector of [2 x double].
2619/// \param __b
2620/// A 128-bit vector of [2 x double].
2621/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2622static __inline int __DEFAULT_FN_ATTRS128
2623_mm_testnzc_pd(__m128d __a, __m128d __b)
2624{
2625 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2626}
2627
2628/// Given two 128-bit floating-point vectors of [4 x float], perform an
2629/// element-by-element comparison of the single-precision element in the
2630/// first source vector and the corresponding element in the second source
2631/// vector.
2632///
2633/// The EFLAGS register is updated as follows: \n
2634/// If there is at least one pair of single-precision elements where the
2635/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2636/// ZF flag is set to 1. \n
2637/// If there is at least one pair of single-precision elements where the
2638/// sign-bit of the first element is 0 and the sign-bit of the second element
2639/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2640/// This intrinsic returns the value of the ZF flag.
2641///
2642/// \headerfile <x86intrin.h>
2643///
2644/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2645///
2646/// \param __a
2647/// A 128-bit vector of [4 x float].
2648/// \param __b
2649/// A 128-bit vector of [4 x float].
2650/// \returns the ZF flag.
2651static __inline int __DEFAULT_FN_ATTRS128
2652_mm_testz_ps(__m128 __a, __m128 __b)
2653{
2654 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2655}
2656
2657/// Given two 128-bit floating-point vectors of [4 x float], perform an
2658/// element-by-element comparison of the single-precision element in the
2659/// first source vector and the corresponding element in the second source
2660/// vector.
2661///
2662/// The EFLAGS register is updated as follows: \n
2663/// If there is at least one pair of single-precision elements where the
2664/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2665/// ZF flag is set to 1. \n
2666/// If there is at least one pair of single-precision elements where the
2667/// sign-bit of the first element is 0 and the sign-bit of the second element
2668/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2669/// This intrinsic returns the value of the CF flag.
2670///
2671/// \headerfile <x86intrin.h>
2672///
2673/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2674///
2675/// \param __a
2676/// A 128-bit vector of [4 x float].
2677/// \param __b
2678/// A 128-bit vector of [4 x float].
2679/// \returns the CF flag.
2680static __inline int __DEFAULT_FN_ATTRS128
2681_mm_testc_ps(__m128 __a, __m128 __b)
2682{
2683 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2684}
2685
2686/// Given two 128-bit floating-point vectors of [4 x float], perform an
2687/// element-by-element comparison of the single-precision element in the
2688/// first source vector and the corresponding element in the second source
2689/// vector.
2690///
2691/// The EFLAGS register is updated as follows: \n
2692/// If there is at least one pair of single-precision elements where the
2693/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2694/// ZF flag is set to 1. \n
2695/// If there is at least one pair of single-precision elements where the
2696/// sign-bit of the first element is 0 and the sign-bit of the second element
2697/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2698/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2699/// otherwise it returns 0.
2700///
2701/// \headerfile <x86intrin.h>
2702///
2703/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2704///
2705/// \param __a
2706/// A 128-bit vector of [4 x float].
2707/// \param __b
2708/// A 128-bit vector of [4 x float].
2709/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2710static __inline int __DEFAULT_FN_ATTRS128
2711_mm_testnzc_ps(__m128 __a, __m128 __b)
2712{
2713 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2714}
2715
2716/// Given two 256-bit floating-point vectors of [4 x double], perform an
2717/// element-by-element comparison of the double-precision elements in the
2718/// first source vector and the corresponding elements in the second source
2719/// vector.
2720///
2721/// The EFLAGS register is updated as follows: \n
2722/// If there is at least one pair of double-precision elements where the
2723/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2724/// ZF flag is set to 1. \n
2725/// If there is at least one pair of double-precision elements where the
2726/// sign-bit of the first element is 0 and the sign-bit of the second element
2727/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2728/// This intrinsic returns the value of the ZF flag.
2729///
2730/// \headerfile <x86intrin.h>
2731///
2732/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2733///
2734/// \param __a
2735/// A 256-bit vector of [4 x double].
2736/// \param __b
2737/// A 256-bit vector of [4 x double].
2738/// \returns the ZF flag.
2739static __inline int __DEFAULT_FN_ATTRS
2740_mm256_testz_pd(__m256d __a, __m256d __b)
2741{
2742 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2743}
2744
2745/// Given two 256-bit floating-point vectors of [4 x double], perform an
2746/// element-by-element comparison of the double-precision elements in the
2747/// first source vector and the corresponding elements in the second source
2748/// vector.
2749///
2750/// The EFLAGS register is updated as follows: \n
2751/// If there is at least one pair of double-precision elements where the
2752/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2753/// ZF flag is set to 1. \n
2754/// If there is at least one pair of double-precision elements where the
2755/// sign-bit of the first element is 0 and the sign-bit of the second element
2756/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2757/// This intrinsic returns the value of the CF flag.
2758///
2759/// \headerfile <x86intrin.h>
2760///
2761/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2762///
2763/// \param __a
2764/// A 256-bit vector of [4 x double].
2765/// \param __b
2766/// A 256-bit vector of [4 x double].
2767/// \returns the CF flag.
2768static __inline int __DEFAULT_FN_ATTRS
2769_mm256_testc_pd(__m256d __a, __m256d __b)
2770{
2771 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2772}
2773
2774/// Given two 256-bit floating-point vectors of [4 x double], perform an
2775/// element-by-element comparison of the double-precision elements in the
2776/// first source vector and the corresponding elements in the second source
2777/// vector.
2778///
2779/// The EFLAGS register is updated as follows: \n
2780/// If there is at least one pair of double-precision elements where the
2781/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2782/// ZF flag is set to 1. \n
2783/// If there is at least one pair of double-precision elements where the
2784/// sign-bit of the first element is 0 and the sign-bit of the second element
2785/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2786/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2787/// otherwise it returns 0.
2788///
2789/// \headerfile <x86intrin.h>
2790///
2791/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2792///
2793/// \param __a
2794/// A 256-bit vector of [4 x double].
2795/// \param __b
2796/// A 256-bit vector of [4 x double].
2797/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2798static __inline int __DEFAULT_FN_ATTRS
2799_mm256_testnzc_pd(__m256d __a, __m256d __b)
2800{
2801 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2802}
2803
2804/// Given two 256-bit floating-point vectors of [8 x float], perform an
2805/// element-by-element comparison of the single-precision element in the
2806/// first source vector and the corresponding element in the second source
2807/// vector.
2808///
2809/// The EFLAGS register is updated as follows: \n
2810/// If there is at least one pair of single-precision elements where the
2811/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2812/// ZF flag is set to 1. \n
2813/// If there is at least one pair of single-precision elements where the
2814/// sign-bit of the first element is 0 and the sign-bit of the second element
2815/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2816/// This intrinsic returns the value of the ZF flag.
2817///
2818/// \headerfile <x86intrin.h>
2819///
2820/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2821///
2822/// \param __a
2823/// A 256-bit vector of [8 x float].
2824/// \param __b
2825/// A 256-bit vector of [8 x float].
2826/// \returns the ZF flag.
2827static __inline int __DEFAULT_FN_ATTRS
2828_mm256_testz_ps(__m256 __a, __m256 __b)
2829{
2830 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2831}
2832
2833/// Given two 256-bit floating-point vectors of [8 x float], perform an
2834/// element-by-element comparison of the single-precision element in the
2835/// first source vector and the corresponding element in the second source
2836/// vector.
2837///
2838/// The EFLAGS register is updated as follows: \n
2839/// If there is at least one pair of single-precision elements where the
2840/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2841/// ZF flag is set to 1. \n
2842/// If there is at least one pair of single-precision elements where the
2843/// sign-bit of the first element is 0 and the sign-bit of the second element
2844/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2845/// This intrinsic returns the value of the CF flag.
2846///
2847/// \headerfile <x86intrin.h>
2848///
2849/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2850///
2851/// \param __a
2852/// A 256-bit vector of [8 x float].
2853/// \param __b
2854/// A 256-bit vector of [8 x float].
2855/// \returns the CF flag.
2856static __inline int __DEFAULT_FN_ATTRS
2857_mm256_testc_ps(__m256 __a, __m256 __b)
2858{
2859 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2860}
2861
2862/// Given two 256-bit floating-point vectors of [8 x float], perform an
2863/// element-by-element comparison of the single-precision elements in the
2864/// first source vector and the corresponding elements in the second source
2865/// vector.
2866///
2867/// The EFLAGS register is updated as follows: \n
2868/// If there is at least one pair of single-precision elements where the
2869/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2870/// ZF flag is set to 1. \n
2871/// If there is at least one pair of single-precision elements where the
2872/// sign-bit of the first element is 0 and the sign-bit of the second element
2873/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2874/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2875/// otherwise it returns 0.
2876///
2877/// \headerfile <x86intrin.h>
2878///
2879/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2880///
2881/// \param __a
2882/// A 256-bit vector of [8 x float].
2883/// \param __b
2884/// A 256-bit vector of [8 x float].
2885/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2886static __inline int __DEFAULT_FN_ATTRS
2888{
2889 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2890}
2891
2892/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2893/// of the two source vectors.
2894///
2895/// The EFLAGS register is updated as follows: \n
2896/// If there is at least one pair of bits where both bits are 1, the ZF flag
2897/// is set to 0. Otherwise the ZF flag is set to 1. \n
2898/// If there is at least one pair of bits where the bit from the first source
2899/// vector is 0 and the bit from the second source vector is 1, the CF flag
2900/// is set to 0. Otherwise the CF flag is set to 1. \n
2901/// This intrinsic returns the value of the ZF flag.
2902///
2903/// \headerfile <x86intrin.h>
2904///
2905/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2906///
2907/// \param __a
2908/// A 256-bit integer vector.
2909/// \param __b
2910/// A 256-bit integer vector.
2911/// \returns the ZF flag.
2912static __inline int __DEFAULT_FN_ATTRS
2913_mm256_testz_si256(__m256i __a, __m256i __b)
2914{
2915 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2916}
2917
2918/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2919/// of the two source vectors.
2920///
2921/// The EFLAGS register is updated as follows: \n
2922/// If there is at least one pair of bits where both bits are 1, the ZF flag
2923/// is set to 0. Otherwise the ZF flag is set to 1. \n
2924/// If there is at least one pair of bits where the bit from the first source
2925/// vector is 0 and the bit from the second source vector is 1, the CF flag
2926/// is set to 0. Otherwise the CF flag is set to 1. \n
2927/// This intrinsic returns the value of the CF flag.
2928///
2929/// \headerfile <x86intrin.h>
2930///
2931/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2932///
2933/// \param __a
2934/// A 256-bit integer vector.
2935/// \param __b
2936/// A 256-bit integer vector.
2937/// \returns the CF flag.
2938static __inline int __DEFAULT_FN_ATTRS
2939_mm256_testc_si256(__m256i __a, __m256i __b)
2940{
2941 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2942}
2943
2944/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2945/// of the two source vectors.
2946///
2947/// The EFLAGS register is updated as follows: \n
2948/// If there is at least one pair of bits where both bits are 1, the ZF flag
2949/// is set to 0. Otherwise the ZF flag is set to 1. \n
2950/// If there is at least one pair of bits where the bit from the first source
2951/// vector is 0 and the bit from the second source vector is 1, the CF flag
2952/// is set to 0. Otherwise the CF flag is set to 1. \n
2953/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2954/// otherwise it returns 0.
2955///
2956/// \headerfile <x86intrin.h>
2957///
2958/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2959///
2960/// \param __a
2961/// A 256-bit integer vector.
2962/// \param __b
2963/// A 256-bit integer vector.
2964/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2965static __inline int __DEFAULT_FN_ATTRS
2967{
2968 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2969}
2970
/* Vector extract sign mask */
2972/// Extracts the sign bits of double-precision floating point elements
2973/// in a 256-bit vector of [4 x double] and writes them to the lower order
2974/// bits of the return value.
2975///
2976/// \headerfile <x86intrin.h>
2977///
2978/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2979///
2980/// \param __a
2981/// A 256-bit vector of [4 x double] containing the double-precision
2982/// floating point values with sign bits to be extracted.
2983/// \returns The sign bits from the operand, written to bits [3:0].
2984static __inline int __DEFAULT_FN_ATTRS
2986{
2987 return __builtin_ia32_movmskpd256((__v4df)__a);
2988}
2989
2990/// Extracts the sign bits of single-precision floating point elements
2991/// in a 256-bit vector of [8 x float] and writes them to the lower order
2992/// bits of the return value.
2993///
2994/// \headerfile <x86intrin.h>
2995///
2996/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2997///
2998/// \param __a
2999/// A 256-bit vector of [8 x float] containing the single-precision floating
3000/// point values with sign bits to be extracted.
3001/// \returns The sign bits from the operand, written to bits [7:0].
3002static __inline int __DEFAULT_FN_ATTRS
3004{
3005 return __builtin_ia32_movmskps256((__v8sf)__a);
3006}
3007
/* Vector zero */
/// Zeroes the contents of all XMM or YMM registers.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}
3019
/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
3030
/* Vector load with broadcast */
3032/// Loads a scalar single-precision floating point value from the
3033/// specified address pointed to by \a __a and broadcasts it to the elements
3034/// of a [4 x float] vector.
3035///
3036/// \headerfile <x86intrin.h>
3037///
3038/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3039///
3040/// \param __a
3041/// The single-precision floating point value to be broadcast.
3042/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3043/// equal to the broadcast value.
3044static __inline __m128 __DEFAULT_FN_ATTRS128
3046{
3047 struct __mm_broadcast_ss_struct {
3048 float __f;
3049 } __attribute__((__packed__, __may_alias__));
3050 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3051 return __extension__ (__m128){ __f, __f, __f, __f };
3052}
3053
3054/// Loads a scalar double-precision floating point value from the
3055/// specified address pointed to by \a __a and broadcasts it to the elements
3056/// of a [4 x double] vector.
3057///
3058/// \headerfile <x86intrin.h>
3059///
3060/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3061///
3062/// \param __a
3063/// The double-precision floating point value to be broadcast.
3064/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3065/// equal to the broadcast value.
3066static __inline __m256d __DEFAULT_FN_ATTRS
3068{
3069 struct __mm256_broadcast_sd_struct {
3070 double __d;
3071 } __attribute__((__packed__, __may_alias__));
3072 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3073 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3074}
3075
3076/// Loads a scalar single-precision floating point value from the
3077/// specified address pointed to by \a __a and broadcasts it to the elements
3078/// of a [8 x float] vector.
3079///
3080/// \headerfile <x86intrin.h>
3081///
3082/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3083///
3084/// \param __a
3085/// The single-precision floating point value to be broadcast.
3086/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3087/// equal to the broadcast value.
3088static __inline __m256 __DEFAULT_FN_ATTRS
3090{
3091 struct __mm256_broadcast_ss_struct {
3092 float __f;
3093 } __attribute__((__packed__, __may_alias__));
3094 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3095 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3096}
3097
3098/// Loads the data from a 128-bit vector of [2 x double] from the
3099/// specified address pointed to by \a __a and broadcasts it to 128-bit
3100/// elements in a 256-bit vector of [4 x double].
3101///
3102/// \headerfile <x86intrin.h>
3103///
3104/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3105///
3106/// \param __a
3107/// The 128-bit vector of [2 x double] to be broadcast.
3108/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3109/// equal to the broadcast value.
3110static __inline __m256d __DEFAULT_FN_ATTRS
3112{
3113 __m128d __b = _mm_loadu_pd((const double *)__a);
3114 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3115 0, 1, 0, 1);
3116}
3117
3118/// Loads the data from a 128-bit vector of [4 x float] from the
3119/// specified address pointed to by \a __a and broadcasts it to 128-bit
3120/// elements in a 256-bit vector of [8 x float].
3121///
3122/// \headerfile <x86intrin.h>
3123///
3124/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3125///
3126/// \param __a
3127/// The 128-bit vector of [4 x float] to be broadcast.
3128/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3129/// equal to the broadcast value.
3130static __inline __m256 __DEFAULT_FN_ATTRS
3132{
3133 __m128 __b = _mm_loadu_ps((const float *)__a);
3134 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3135 0, 1, 2, 3, 0, 1, 2, 3);
3136}
3137
3138/* SIMD load ops */
3139/// Loads 4 double-precision floating point values from a 32-byte aligned
3140/// memory location pointed to by \a __p into a vector of [4 x double].
3141///
3142/// \headerfile <x86intrin.h>
3143///
3144/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3145///
3146/// \param __p
3147/// A 32-byte aligned pointer to a memory location containing
3148/// double-precision floating point values.
3149/// \returns A 256-bit vector of [4 x double] containing the moved values.
3150static __inline __m256d __DEFAULT_FN_ATTRS
3151_mm256_load_pd(double const *__p)
3152{
3153 return *(const __m256d *)__p;
3154}
3155
3156/// Loads 8 single-precision floating point values from a 32-byte aligned
3157/// memory location pointed to by \a __p into a vector of [8 x float].
3158///
3159/// \headerfile <x86intrin.h>
3160///
3161/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3162///
3163/// \param __p
3164/// A 32-byte aligned pointer to a memory location containing float values.
3165/// \returns A 256-bit vector of [8 x float] containing the moved values.
3166static __inline __m256 __DEFAULT_FN_ATTRS
3167_mm256_load_ps(float const *__p)
3168{
3169 return *(const __m256 *)__p;
3170}
3171
3172/// Loads 4 double-precision floating point values from an unaligned
3173/// memory location pointed to by \a __p into a vector of [4 x double].
3174///
3175/// \headerfile <x86intrin.h>
3176///
3177/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3178///
3179/// \param __p
3180/// A pointer to a memory location containing double-precision floating
3181/// point values.
3182/// \returns A 256-bit vector of [4 x double] containing the moved values.
3183static __inline __m256d __DEFAULT_FN_ATTRS
3184_mm256_loadu_pd(double const *__p)
3185{
3186 struct __loadu_pd {
3187 __m256d_u __v;
3188 } __attribute__((__packed__, __may_alias__));
3189 return ((const struct __loadu_pd*)__p)->__v;
3190}
3191
3192/// Loads 8 single-precision floating point values from an unaligned
3193/// memory location pointed to by \a __p into a vector of [8 x float].
3194///
3195/// \headerfile <x86intrin.h>
3196///
3197/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3198///
3199/// \param __p
3200/// A pointer to a memory location containing single-precision floating
3201/// point values.
3202/// \returns A 256-bit vector of [8 x float] containing the moved values.
3203static __inline __m256 __DEFAULT_FN_ATTRS
3205{
3206 struct __loadu_ps {
3207 __m256_u __v;
3208 } __attribute__((__packed__, __may_alias__));
3209 return ((const struct __loadu_ps*)__p)->__v;
3210}
3211
3212/// Loads 256 bits of integer data from a 32-byte aligned memory
3213/// location pointed to by \a __p into elements of a 256-bit integer vector.
3214///
3215/// \headerfile <x86intrin.h>
3216///
3217/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3218///
3219/// \param __p
3220/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3221/// values.
3222/// \returns A 256-bit integer vector containing the moved values.
3223static __inline __m256i __DEFAULT_FN_ATTRS
3224_mm256_load_si256(__m256i const *__p)
3225{
3226 return *__p;
3227}
3228
3229/// Loads 256 bits of integer data from an unaligned memory location
3230/// pointed to by \a __p into a 256-bit integer vector.
3231///
3232/// \headerfile <x86intrin.h>
3233///
3234/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3235///
3236/// \param __p
3237/// A pointer to a 256-bit integer vector containing integer values.
3238/// \returns A 256-bit integer vector containing the moved values.
3239static __inline __m256i __DEFAULT_FN_ATTRS
3240_mm256_loadu_si256(__m256i_u const *__p)
3241{
3242 struct __loadu_si256 {
3243 __m256i_u __v;
3244 } __attribute__((__packed__, __may_alias__));
3245 return ((const struct __loadu_si256*)__p)->__v;
3246}
3247
3248/// Loads 256 bits of integer data from an unaligned memory location
3249/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3250/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3251/// line boundary.
3252///
3253/// \headerfile <x86intrin.h>
3254///
3255/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3256///
3257/// \param __p
3258/// A pointer to a 256-bit integer vector containing integer values.
3259/// \returns A 256-bit integer vector containing the moved values.
3260static __inline __m256i __DEFAULT_FN_ATTRS
3261_mm256_lddqu_si256(__m256i_u const *__p)
3262{
3263 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3264}
3265
3266/* SIMD store ops */
3267/// Stores double-precision floating point values from a 256-bit vector
3268/// of [4 x double] to a 32-byte aligned memory location pointed to by
3269/// \a __p.
3270///
3271/// \headerfile <x86intrin.h>
3272///
3273/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3274///
3275/// \param __p
3276/// A 32-byte aligned pointer to a memory location that will receive the
3277/// double-precision floating point values.
3278/// \param __a
3279/// A 256-bit vector of [4 x double] containing the values to be moved.
3280static __inline void __DEFAULT_FN_ATTRS
3281_mm256_store_pd(double *__p, __m256d __a)
3282{
3283 *(__m256d *)__p = __a;
3284}
3285
3286/// Stores single-precision floating point values from a 256-bit vector
3287/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3288///
3289/// \headerfile <x86intrin.h>
3290///
3291/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3292///
3293/// \param __p
3294/// A 32-byte aligned pointer to a memory location that will receive the
3295/// float values.
3296/// \param __a
3297/// A 256-bit vector of [8 x float] containing the values to be moved.
3298static __inline void __DEFAULT_FN_ATTRS
3299_mm256_store_ps(float *__p, __m256 __a)
3300{
3301 *(__m256 *)__p = __a;
3302}
3303
3304/// Stores double-precision floating point values from a 256-bit vector
3305/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3306///
3307/// \headerfile <x86intrin.h>
3308///
3309/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3310///
3311/// \param __p
3312/// A pointer to a memory location that will receive the double-precision
3313/// floating point values.
3314/// \param __a
3315/// A 256-bit vector of [4 x double] containing the values to be moved.
3316static __inline void __DEFAULT_FN_ATTRS
3317_mm256_storeu_pd(double *__p, __m256d __a)
3318{
3319 struct __storeu_pd {
3320 __m256d_u __v;
3321 } __attribute__((__packed__, __may_alias__));
3322 ((struct __storeu_pd*)__p)->__v = __a;
3323}
3324
3325/// Stores single-precision floating point values from a 256-bit vector
3326/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3327///
3328/// \headerfile <x86intrin.h>
3329///
3330/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3331///
3332/// \param __p
3333/// A pointer to a memory location that will receive the float values.
3334/// \param __a
3335/// A 256-bit vector of [8 x float] containing the values to be moved.
3336static __inline void __DEFAULT_FN_ATTRS
3337_mm256_storeu_ps(float *__p, __m256 __a)
3338{
3339 struct __storeu_ps {
3340 __m256_u __v;
3341 } __attribute__((__packed__, __may_alias__));
3342 ((struct __storeu_ps*)__p)->__v = __a;
3343}
3344
3345/// Stores integer values from a 256-bit integer vector to a 32-byte
3346/// aligned memory location pointed to by \a __p.
3347///
3348/// \headerfile <x86intrin.h>
3349///
3350/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3351///
3352/// \param __p
3353/// A 32-byte aligned pointer to a memory location that will receive the
3354/// integer values.
3355/// \param __a
3356/// A 256-bit integer vector containing the values to be moved.
3357static __inline void __DEFAULT_FN_ATTRS
3358_mm256_store_si256(__m256i *__p, __m256i __a)
3359{
3360 *__p = __a;
3361}
3362
3363/// Stores integer values from a 256-bit integer vector to an unaligned
3364/// memory location pointed to by \a __p.
3365///
3366/// \headerfile <x86intrin.h>
3367///
3368/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3369///
3370/// \param __p
3371/// A pointer to a memory location that will receive the integer values.
3372/// \param __a
3373/// A 256-bit integer vector containing the values to be moved.
3374static __inline void __DEFAULT_FN_ATTRS
3375_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3376{
3377 struct __storeu_si256 {
3378 __m256i_u __v;
3379 } __attribute__((__packed__, __may_alias__));
3380 ((struct __storeu_si256*)__p)->__v = __a;
3381}
3382
3383/* Conditional load ops */
3384/// Conditionally loads double-precision floating point elements from a
3385/// memory location pointed to by \a __p into a 128-bit vector of
3386/// [2 x double], depending on the mask bits associated with each data
3387/// element.
3388///
3389/// \headerfile <x86intrin.h>
3390///
3391/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3392///
3393/// \param __p
3394/// A pointer to a memory location that contains the double-precision
3395/// floating point values.
3396/// \param __m
3397/// A 128-bit integer vector containing the mask. The most significant bit of
3398/// each data element represents the mask bits. If a mask bit is zero, the
3399/// corresponding value in the memory location is not loaded and the
3400/// corresponding field in the return value is set to zero.
3401/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3402static __inline __m128d __DEFAULT_FN_ATTRS128
3403_mm_maskload_pd(double const *__p, __m128i __m)
3404{
3405 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3406}
3407
3408/// Conditionally loads double-precision floating point elements from a
3409/// memory location pointed to by \a __p into a 256-bit vector of
3410/// [4 x double], depending on the mask bits associated with each data
3411/// element.
3412///
3413/// \headerfile <x86intrin.h>
3414///
3415/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3416///
3417/// \param __p
3418/// A pointer to a memory location that contains the double-precision
3419/// floating point values.
3420/// \param __m
3421/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3422/// significant bit of each quadword element represents the mask bits. If a
3423/// mask bit is zero, the corresponding value in the memory location is not
3424/// loaded and the corresponding field in the return value is set to zero.
3425/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3426static __inline __m256d __DEFAULT_FN_ATTRS
3427_mm256_maskload_pd(double const *__p, __m256i __m)
3428{
3429 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3430 (__v4di)__m);
3431}
3432
3433/// Conditionally loads single-precision floating point elements from a
3434/// memory location pointed to by \a __p into a 128-bit vector of
3435/// [4 x float], depending on the mask bits associated with each data
3436/// element.
3437///
3438/// \headerfile <x86intrin.h>
3439///
3440/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3441///
3442/// \param __p
3443/// A pointer to a memory location that contains the single-precision
3444/// floating point values.
3445/// \param __m
3446/// A 128-bit integer vector containing the mask. The most significant bit of
3447/// each data element represents the mask bits. If a mask bit is zero, the
3448/// corresponding value in the memory location is not loaded and the
3449/// corresponding field in the return value is set to zero.
3450/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3451static __inline __m128 __DEFAULT_FN_ATTRS128
3452_mm_maskload_ps(float const *__p, __m128i __m)
3453{
3454 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3455}
3456
3457/// Conditionally loads single-precision floating point elements from a
3458/// memory location pointed to by \a __p into a 256-bit vector of
3459/// [8 x float], depending on the mask bits associated with each data
3460/// element.
3461///
3462/// \headerfile <x86intrin.h>
3463///
3464/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3465///
3466/// \param __p
3467/// A pointer to a memory location that contains the single-precision
3468/// floating point values.
3469/// \param __m
3470/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3471/// significant bit of each dword element represents the mask bits. If a mask
3472/// bit is zero, the corresponding value in the memory location is not loaded
3473/// and the corresponding field in the return value is set to zero.
3474/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3475static __inline __m256 __DEFAULT_FN_ATTRS
3476_mm256_maskload_ps(float const *__p, __m256i __m)
3477{
3478 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3479}
3480
3481/* Conditional store ops */
3482/// Moves single-precision floating point values from a 256-bit vector
3483/// of [8 x float] to a memory location pointed to by \a __p, according to
3484/// the specified mask.
3485///
3486/// \headerfile <x86intrin.h>
3487///
3488/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3489///
3490/// \param __p
3491/// A pointer to a memory location that will receive the float values.
3492/// \param __m
3493/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3494/// significant bit of each dword element in the mask vector represents the
3495/// mask bits. If a mask bit is zero, the corresponding value from vector
3496/// \a __a is not stored and the corresponding field in the memory location
3497/// pointed to by \a __p is not changed.
3498/// \param __a
3499/// A 256-bit vector of [8 x float] containing the values to be stored.
3500static __inline void __DEFAULT_FN_ATTRS
3501_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3502{
3503 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3504}
3505
3506/// Moves double-precision values from a 128-bit vector of [2 x double]
3507/// to a memory location pointed to by \a __p, according to the specified
3508/// mask.
3509///
3510/// \headerfile <x86intrin.h>
3511///
3512/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3513///
3514/// \param __p
3515/// A pointer to a memory location that will receive the double values.
3516/// \param __m
3517/// A 128-bit integer vector containing the mask. The most significant bit of
3518/// each field in the mask vector represents the mask bits. If a mask bit is
3519/// zero, the corresponding value from vector \a __a is not stored and the
3520/// corresponding field in the memory location pointed to by \a __p is not
3521/// changed.
3522/// \param __a
3523/// A 128-bit vector of [2 x double] containing the values to be stored.
3524static __inline void __DEFAULT_FN_ATTRS128
3525_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3526{
3527 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3528}
3529
3530/// Moves double-precision values from a 256-bit vector of [4 x double]
3531/// to a memory location pointed to by \a __p, according to the specified
3532/// mask.
3533///
3534/// \headerfile <x86intrin.h>
3535///
3536/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3537///
3538/// \param __p
3539/// A pointer to a memory location that will receive the double values.
3540/// \param __m
3541/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3542/// significant bit of each quadword element in the mask vector represents
3543/// the mask bits. If a mask bit is zero, the corresponding value from vector
3544/// \a __a is not stored and the corresponding field in the memory location
3545/// pointed to by \a __p is not changed.
3546/// \param __a
3547/// A 256-bit vector of [4 x double] containing the values to be stored.
3548static __inline void __DEFAULT_FN_ATTRS
3549_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3550{
3551 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3552}
3553
3554/// Moves single-precision floating point values from a 128-bit vector
3555/// of [4 x float] to a memory location pointed to by \a __p, according to
3556/// the specified mask.
3557///
3558/// \headerfile <x86intrin.h>
3559///
3560/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3561///
3562/// \param __p
3563/// A pointer to a memory location that will receive the float values.
3564/// \param __m
3565/// A 128-bit integer vector containing the mask. The most significant bit of
3566/// each field in the mask vector represents the mask bits. If a mask bit is
3567/// zero, the corresponding value from vector \a __a is not stored and the
3568/// corresponding field in the memory location pointed to by \a __p is not
3569/// changed.
3570/// \param __a
3571/// A 128-bit vector of [4 x float] containing the values to be stored.
3572static __inline void __DEFAULT_FN_ATTRS128
3573_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3574{
3575 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3576}
3577
3578/* Cacheability support ops */
3579/// Moves integer data from a 256-bit integer vector to a 32-byte
3580/// aligned memory location. To minimize caching, the data is flagged as
3581/// non-temporal (unlikely to be used again soon).
3582///
3583/// \headerfile <x86intrin.h>
3584///
3585/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3586///
3587/// \param __a
3588/// A pointer to a 32-byte aligned memory location that will receive the
3589/// integer values.
3590/// \param __b
3591/// A 256-bit integer vector containing the values to be moved.
3592static __inline void __DEFAULT_FN_ATTRS
3594{
3595 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3596 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3597}
3598
3599/// Moves double-precision values from a 256-bit vector of [4 x double]
3600/// to a 32-byte aligned memory location. To minimize caching, the data is
3601/// flagged as non-temporal (unlikely to be used again soon).
3602///
3603/// \headerfile <x86intrin.h>
3604///
3605/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3606///
3607/// \param __a
3608/// A pointer to a 32-byte aligned memory location that will receive the
3609/// double-precision floating-point values.
3610/// \param __b
3611/// A 256-bit vector of [4 x double] containing the values to be moved.
3612static __inline void __DEFAULT_FN_ATTRS
3613_mm256_stream_pd(void *__a, __m256d __b)
3614{
3615 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3616 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3617}
3618
3619/// Moves single-precision floating point values from a 256-bit vector
3620/// of [8 x float] to a 32-byte aligned memory location. To minimize
3621/// caching, the data is flagged as non-temporal (unlikely to be used again
3622/// soon).
3623///
3624/// \headerfile <x86intrin.h>
3625///
3626/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3627///
3628/// \param __p
3629/// A pointer to a 32-byte aligned memory location that will receive the
3630/// single-precision floating point values.
3631/// \param __a
3632/// A 256-bit vector of [8 x float] containing the values to be moved.
3633static __inline void __DEFAULT_FN_ATTRS
3635{
3636 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3637 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3638}
3639
3640/* Create vectors */
3641/// Create a 256-bit vector of [4 x double] with undefined values.
3642///
3643/// \headerfile <x86intrin.h>
3644///
3645/// This intrinsic has no corresponding instruction.
3646///
3647/// \returns A 256-bit vector of [4 x double] containing undefined values.
3648static __inline__ __m256d __DEFAULT_FN_ATTRS
3650{
3651 return (__m256d)__builtin_ia32_undef256();
3652}
3653
3654/// Create a 256-bit vector of [8 x float] with undefined values.
3655///
3656/// \headerfile <x86intrin.h>
3657///
3658/// This intrinsic has no corresponding instruction.
3659///
3660/// \returns A 256-bit vector of [8 x float] containing undefined values.
3661static __inline__ __m256 __DEFAULT_FN_ATTRS
3663{
3664 return (__m256)__builtin_ia32_undef256();
3665}
3666
3667/// Create a 256-bit integer vector with undefined values.
3668///
3669/// \headerfile <x86intrin.h>
3670///
3671/// This intrinsic has no corresponding instruction.
3672///
3673/// \returns A 256-bit integer vector containing undefined values.
3674static __inline__ __m256i __DEFAULT_FN_ATTRS
3676{
3677 return (__m256i)__builtin_ia32_undef256();
3678}
3679
3680/// Constructs a 256-bit floating-point vector of [4 x double]
3681/// initialized with the specified double-precision floating-point values.
3682///
3683/// \headerfile <x86intrin.h>
3684///
3685/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3686/// instruction.
3687///
3688/// \param __a
3689/// A double-precision floating-point value used to initialize bits [255:192]
3690/// of the result.
3691/// \param __b
3692/// A double-precision floating-point value used to initialize bits [191:128]
3693/// of the result.
3694/// \param __c
3695/// A double-precision floating-point value used to initialize bits [127:64]
3696/// of the result.
3697/// \param __d
3698/// A double-precision floating-point value used to initialize bits [63:0]
3699/// of the result.
3700/// \returns An initialized 256-bit floating-point vector of [4 x double].
3701static __inline __m256d __DEFAULT_FN_ATTRS
3702_mm256_set_pd(double __a, double __b, double __c, double __d)
3703{
3704 return __extension__ (__m256d){ __d, __c, __b, __a };
3705}
3706
3707/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3708/// with the specified single-precision floating-point values.
3709///
3710/// \headerfile <x86intrin.h>
3711///
3712/// This intrinsic is a utility function and does not correspond to a specific
3713/// instruction.
3714///
3715/// \param __a
3716/// A single-precision floating-point value used to initialize bits [255:224]
3717/// of the result.
3718/// \param __b
3719/// A single-precision floating-point value used to initialize bits [223:192]
3720/// of the result.
3721/// \param __c
3722/// A single-precision floating-point value used to initialize bits [191:160]
3723/// of the result.
3724/// \param __d
3725/// A single-precision floating-point value used to initialize bits [159:128]
3726/// of the result.
3727/// \param __e
3728/// A single-precision floating-point value used to initialize bits [127:96]
3729/// of the result.
3730/// \param __f
3731/// A single-precision floating-point value used to initialize bits [95:64]
3732/// of the result.
3733/// \param __g
3734/// A single-precision floating-point value used to initialize bits [63:32]
3735/// of the result.
3736/// \param __h
3737/// A single-precision floating-point value used to initialize bits [31:0]
3738/// of the result.
3739/// \returns An initialized 256-bit floating-point vector of [8 x float].
3740static __inline __m256 __DEFAULT_FN_ATTRS
3741_mm256_set_ps(float __a, float __b, float __c, float __d,
3742 float __e, float __f, float __g, float __h)
3743{
3744 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3745}
3746
3747/// Constructs a 256-bit integer vector initialized with the specified
3748/// 32-bit integral values.
3749///
3750/// \headerfile <x86intrin.h>
3751///
3752/// This intrinsic is a utility function and does not correspond to a specific
3753/// instruction.
3754///
3755/// \param __i0
3756/// A 32-bit integral value used to initialize bits [255:224] of the result.
3757/// \param __i1
3758/// A 32-bit integral value used to initialize bits [223:192] of the result.
3759/// \param __i2
3760/// A 32-bit integral value used to initialize bits [191:160] of the result.
3761/// \param __i3
3762/// A 32-bit integral value used to initialize bits [159:128] of the result.
3763/// \param __i4
3764/// A 32-bit integral value used to initialize bits [127:96] of the result.
3765/// \param __i5
3766/// A 32-bit integral value used to initialize bits [95:64] of the result.
3767/// \param __i6
3768/// A 32-bit integral value used to initialize bits [63:32] of the result.
3769/// \param __i7
3770/// A 32-bit integral value used to initialize bits [31:0] of the result.
3771/// \returns An initialized 256-bit integer vector.
3772static __inline __m256i __DEFAULT_FN_ATTRS
3773_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3774 int __i4, int __i5, int __i6, int __i7)
3775{
3776 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3777}
3778
3779/// Constructs a 256-bit integer vector initialized with the specified
3780/// 16-bit integral values.
3781///
3782/// \headerfile <x86intrin.h>
3783///
3784/// This intrinsic is a utility function and does not correspond to a specific
3785/// instruction.
3786///
3787/// \param __w15
3788/// A 16-bit integral value used to initialize bits [255:240] of the result.
3789/// \param __w14
3790/// A 16-bit integral value used to initialize bits [239:224] of the result.
3791/// \param __w13
3792/// A 16-bit integral value used to initialize bits [223:208] of the result.
3793/// \param __w12
3794/// A 16-bit integral value used to initialize bits [207:192] of the result.
3795/// \param __w11
3796/// A 16-bit integral value used to initialize bits [191:176] of the result.
3797/// \param __w10
3798/// A 16-bit integral value used to initialize bits [175:160] of the result.
3799/// \param __w09
3800/// A 16-bit integral value used to initialize bits [159:144] of the result.
3801/// \param __w08
3802/// A 16-bit integral value used to initialize bits [143:128] of the result.
3803/// \param __w07
3804/// A 16-bit integral value used to initialize bits [127:112] of the result.
3805/// \param __w06
3806/// A 16-bit integral value used to initialize bits [111:96] of the result.
3807/// \param __w05
3808/// A 16-bit integral value used to initialize bits [95:80] of the result.
3809/// \param __w04
3810/// A 16-bit integral value used to initialize bits [79:64] of the result.
3811/// \param __w03
3812/// A 16-bit integral value used to initialize bits [63:48] of the result.
3813/// \param __w02
3814/// A 16-bit integral value used to initialize bits [47:32] of the result.
3815/// \param __w01
3816/// A 16-bit integral value used to initialize bits [31:16] of the result.
3817/// \param __w00
3818/// A 16-bit integral value used to initialize bits [15:0] of the result.
3819/// \returns An initialized 256-bit integer vector.
3820static __inline __m256i __DEFAULT_FN_ATTRS
3821_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3822 short __w11, short __w10, short __w09, short __w08,
3823 short __w07, short __w06, short __w05, short __w04,
3824 short __w03, short __w02, short __w01, short __w00)
3825{
3826 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3827 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3828}
3829
3830/// Constructs a 256-bit integer vector initialized with the specified
3831/// 8-bit integral values.
3832///
3833/// \headerfile <x86intrin.h>
3834///
3835/// This intrinsic is a utility function and does not correspond to a specific
3836/// instruction.
3837///
3838/// \param __b31
3839/// An 8-bit integral value used to initialize bits [255:248] of the result.
3840/// \param __b30
3841/// An 8-bit integral value used to initialize bits [247:240] of the result.
3842/// \param __b29
3843/// An 8-bit integral value used to initialize bits [239:232] of the result.
3844/// \param __b28
3845/// An 8-bit integral value used to initialize bits [231:224] of the result.
3846/// \param __b27
3847/// An 8-bit integral value used to initialize bits [223:216] of the result.
3848/// \param __b26
3849/// An 8-bit integral value used to initialize bits [215:208] of the result.
3850/// \param __b25
3851/// An 8-bit integral value used to initialize bits [207:200] of the result.
3852/// \param __b24
3853/// An 8-bit integral value used to initialize bits [199:192] of the result.
3854/// \param __b23
3855/// An 8-bit integral value used to initialize bits [191:184] of the result.
3856/// \param __b22
3857/// An 8-bit integral value used to initialize bits [183:176] of the result.
3858/// \param __b21
3859/// An 8-bit integral value used to initialize bits [175:168] of the result.
3860/// \param __b20
3861/// An 8-bit integral value used to initialize bits [167:160] of the result.
3862/// \param __b19
3863/// An 8-bit integral value used to initialize bits [159:152] of the result.
3864/// \param __b18
3865/// An 8-bit integral value used to initialize bits [151:144] of the result.
3866/// \param __b17
3867/// An 8-bit integral value used to initialize bits [143:136] of the result.
3868/// \param __b16
3869/// An 8-bit integral value used to initialize bits [135:128] of the result.
3870/// \param __b15
3871/// An 8-bit integral value used to initialize bits [127:120] of the result.
3872/// \param __b14
3873/// An 8-bit integral value used to initialize bits [119:112] of the result.
3874/// \param __b13
3875/// An 8-bit integral value used to initialize bits [111:104] of the result.
3876/// \param __b12
3877/// An 8-bit integral value used to initialize bits [103:96] of the result.
3878/// \param __b11
3879/// An 8-bit integral value used to initialize bits [95:88] of the result.
3880/// \param __b10
3881/// An 8-bit integral value used to initialize bits [87:80] of the result.
3882/// \param __b09
3883/// An 8-bit integral value used to initialize bits [79:72] of the result.
3884/// \param __b08
3885/// An 8-bit integral value used to initialize bits [71:64] of the result.
3886/// \param __b07
3887/// An 8-bit integral value used to initialize bits [63:56] of the result.
3888/// \param __b06
3889/// An 8-bit integral value used to initialize bits [55:48] of the result.
3890/// \param __b05
3891/// An 8-bit integral value used to initialize bits [47:40] of the result.
3892/// \param __b04
3893/// An 8-bit integral value used to initialize bits [39:32] of the result.
3894/// \param __b03
3895/// An 8-bit integral value used to initialize bits [31:24] of the result.
3896/// \param __b02
3897/// An 8-bit integral value used to initialize bits [23:16] of the result.
3898/// \param __b01
3899/// An 8-bit integral value used to initialize bits [15:8] of the result.
3900/// \param __b00
3901/// An 8-bit integral value used to initialize bits [7:0] of the result.
3902/// \returns An initialized 256-bit integer vector.
3903static __inline __m256i __DEFAULT_FN_ATTRS
3904_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3905 char __b27, char __b26, char __b25, char __b24,
3906 char __b23, char __b22, char __b21, char __b20,
3907 char __b19, char __b18, char __b17, char __b16,
3908 char __b15, char __b14, char __b13, char __b12,
3909 char __b11, char __b10, char __b09, char __b08,
3910 char __b07, char __b06, char __b05, char __b04,
3911 char __b03, char __b02, char __b01, char __b00)
3912{
3913 return __extension__ (__m256i)(__v32qi){
3914 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3915 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3916 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3917 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3918 };
3919}
3920
3921/// Constructs a 256-bit integer vector initialized with the specified
3922/// 64-bit integral values.
3923///
3924/// \headerfile <x86intrin.h>
3925///
3926/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3927/// instruction.
3928///
3929/// \param __a
3930/// A 64-bit integral value used to initialize bits [255:192] of the result.
3931/// \param __b
3932/// A 64-bit integral value used to initialize bits [191:128] of the result.
3933/// \param __c
3934/// A 64-bit integral value used to initialize bits [127:64] of the result.
3935/// \param __d
3936/// A 64-bit integral value used to initialize bits [63:0] of the result.
3937/// \returns An initialized 256-bit integer vector.
3938static __inline __m256i __DEFAULT_FN_ATTRS
3939_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3940{
3941 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3942}
3943
3944/* Create vectors with elements in reverse order */
3945/// Constructs a 256-bit floating-point vector of [4 x double],
3946/// initialized in reverse order with the specified double-precision
3947/// floating-point values.
3948///
3949/// \headerfile <x86intrin.h>
3950///
3951/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3952/// instruction.
3953///
3954/// \param __a
3955/// A double-precision floating-point value used to initialize bits [63:0]
3956/// of the result.
3957/// \param __b
3958/// A double-precision floating-point value used to initialize bits [127:64]
3959/// of the result.
3960/// \param __c
3961/// A double-precision floating-point value used to initialize bits [191:128]
3962/// of the result.
3963/// \param __d
3964/// A double-precision floating-point value used to initialize bits [255:192]
3965/// of the result.
3966/// \returns An initialized 256-bit floating-point vector of [4 x double].
3967static __inline __m256d __DEFAULT_FN_ATTRS
3968_mm256_setr_pd(double __a, double __b, double __c, double __d)
3969{
3970 return _mm256_set_pd(__d, __c, __b, __a);
3971}
3972
3973/// Constructs a 256-bit floating-point vector of [8 x float],
3974/// initialized in reverse order with the specified single-precision
3975/// float-point values.
3976///
3977/// \headerfile <x86intrin.h>
3978///
3979/// This intrinsic is a utility function and does not correspond to a specific
3980/// instruction.
3981///
3982/// \param __a
3983/// A single-precision floating-point value used to initialize bits [31:0]
3984/// of the result.
3985/// \param __b
3986/// A single-precision floating-point value used to initialize bits [63:32]
3987/// of the result.
3988/// \param __c
3989/// A single-precision floating-point value used to initialize bits [95:64]
3990/// of the result.
3991/// \param __d
3992/// A single-precision floating-point value used to initialize bits [127:96]
3993/// of the result.
3994/// \param __e
3995/// A single-precision floating-point value used to initialize bits [159:128]
3996/// of the result.
3997/// \param __f
3998/// A single-precision floating-point value used to initialize bits [191:160]
3999/// of the result.
4000/// \param __g
4001/// A single-precision floating-point value used to initialize bits [223:192]
4002/// of the result.
4003/// \param __h
4004/// A single-precision floating-point value used to initialize bits [255:224]
4005/// of the result.
4006/// \returns An initialized 256-bit floating-point vector of [8 x float].
4007static __inline __m256 __DEFAULT_FN_ATTRS
4008_mm256_setr_ps(float __a, float __b, float __c, float __d,
4009 float __e, float __f, float __g, float __h)
4010{
4011 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
4012}
4013
4014/// Constructs a 256-bit integer vector, initialized in reverse order
4015/// with the specified 32-bit integral values.
4016///
4017/// \headerfile <x86intrin.h>
4018///
4019/// This intrinsic is a utility function and does not correspond to a specific
4020/// instruction.
4021///
4022/// \param __i0
4023/// A 32-bit integral value used to initialize bits [31:0] of the result.
4024/// \param __i1
4025/// A 32-bit integral value used to initialize bits [63:32] of the result.
4026/// \param __i2
4027/// A 32-bit integral value used to initialize bits [95:64] of the result.
4028/// \param __i3
4029/// A 32-bit integral value used to initialize bits [127:96] of the result.
4030/// \param __i4
4031/// A 32-bit integral value used to initialize bits [159:128] of the result.
4032/// \param __i5
4033/// A 32-bit integral value used to initialize bits [191:160] of the result.
4034/// \param __i6
4035/// A 32-bit integral value used to initialize bits [223:192] of the result.
4036/// \param __i7
4037/// A 32-bit integral value used to initialize bits [255:224] of the result.
4038/// \returns An initialized 256-bit integer vector.
4039static __inline __m256i __DEFAULT_FN_ATTRS
4040_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4041 int __i4, int __i5, int __i6, int __i7)
4042{
4043 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4044}
4045
4046/// Constructs a 256-bit integer vector, initialized in reverse order
4047/// with the specified 16-bit integral values.
4048///
4049/// \headerfile <x86intrin.h>
4050///
4051/// This intrinsic is a utility function and does not correspond to a specific
4052/// instruction.
4053///
4054/// \param __w15
4055/// A 16-bit integral value used to initialize bits [15:0] of the result.
4056/// \param __w14
4057/// A 16-bit integral value used to initialize bits [31:16] of the result.
4058/// \param __w13
4059/// A 16-bit integral value used to initialize bits [47:32] of the result.
4060/// \param __w12
4061/// A 16-bit integral value used to initialize bits [63:48] of the result.
4062/// \param __w11
4063/// A 16-bit integral value used to initialize bits [79:64] of the result.
4064/// \param __w10
4065/// A 16-bit integral value used to initialize bits [95:80] of the result.
4066/// \param __w09
4067/// A 16-bit integral value used to initialize bits [111:96] of the result.
4068/// \param __w08
4069/// A 16-bit integral value used to initialize bits [127:112] of the result.
4070/// \param __w07
4071/// A 16-bit integral value used to initialize bits [143:128] of the result.
4072/// \param __w06
4073/// A 16-bit integral value used to initialize bits [159:144] of the result.
4074/// \param __w05
4075/// A 16-bit integral value used to initialize bits [175:160] of the result.
4076/// \param __w04
4077/// A 16-bit integral value used to initialize bits [191:176] of the result.
4078/// \param __w03
4079/// A 16-bit integral value used to initialize bits [207:192] of the result.
4080/// \param __w02
4081/// A 16-bit integral value used to initialize bits [223:208] of the result.
4082/// \param __w01
4083/// A 16-bit integral value used to initialize bits [239:224] of the result.
4084/// \param __w00
4085/// A 16-bit integral value used to initialize bits [255:240] of the result.
4086/// \returns An initialized 256-bit integer vector.
4087static __inline __m256i __DEFAULT_FN_ATTRS
4088_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4089 short __w11, short __w10, short __w09, short __w08,
4090 short __w07, short __w06, short __w05, short __w04,
4091 short __w03, short __w02, short __w01, short __w00)
4092{
4093 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4094 __w04, __w05, __w06, __w07,
4095 __w08, __w09, __w10, __w11,
4096 __w12, __w13, __w14, __w15);
4097}
4098
4099/// Constructs a 256-bit integer vector, initialized in reverse order
4100/// with the specified 8-bit integral values.
4101///
4102/// \headerfile <x86intrin.h>
4103///
4104/// This intrinsic is a utility function and does not correspond to a specific
4105/// instruction.
4106///
4107/// \param __b31
4108/// An 8-bit integral value used to initialize bits [7:0] of the result.
4109/// \param __b30
4110/// An 8-bit integral value used to initialize bits [15:8] of the result.
4111/// \param __b29
4112/// An 8-bit integral value used to initialize bits [23:16] of the result.
4113/// \param __b28
4114/// An 8-bit integral value used to initialize bits [31:24] of the result.
4115/// \param __b27
4116/// An 8-bit integral value used to initialize bits [39:32] of the result.
4117/// \param __b26
4118/// An 8-bit integral value used to initialize bits [47:40] of the result.
4119/// \param __b25
4120/// An 8-bit integral value used to initialize bits [55:48] of the result.
4121/// \param __b24
4122/// An 8-bit integral value used to initialize bits [63:56] of the result.
4123/// \param __b23
4124/// An 8-bit integral value used to initialize bits [71:64] of the result.
4125/// \param __b22
4126/// An 8-bit integral value used to initialize bits [79:72] of the result.
4127/// \param __b21
4128/// An 8-bit integral value used to initialize bits [87:80] of the result.
4129/// \param __b20
4130/// An 8-bit integral value used to initialize bits [95:88] of the result.
4131/// \param __b19
4132/// An 8-bit integral value used to initialize bits [103:96] of the result.
4133/// \param __b18
4134/// An 8-bit integral value used to initialize bits [111:104] of the result.
4135/// \param __b17
4136/// An 8-bit integral value used to initialize bits [119:112] of the result.
4137/// \param __b16
4138/// An 8-bit integral value used to initialize bits [127:120] of the result.
4139/// \param __b15
4140/// An 8-bit integral value used to initialize bits [135:128] of the result.
4141/// \param __b14
4142/// An 8-bit integral value used to initialize bits [143:136] of the result.
4143/// \param __b13
4144/// An 8-bit integral value used to initialize bits [151:144] of the result.
4145/// \param __b12
4146/// An 8-bit integral value used to initialize bits [159:152] of the result.
4147/// \param __b11
4148/// An 8-bit integral value used to initialize bits [167:160] of the result.
4149/// \param __b10
4150/// An 8-bit integral value used to initialize bits [175:168] of the result.
4151/// \param __b09
4152/// An 8-bit integral value used to initialize bits [183:176] of the result.
4153/// \param __b08
4154/// An 8-bit integral value used to initialize bits [191:184] of the result.
4155/// \param __b07
4156/// An 8-bit integral value used to initialize bits [199:192] of the result.
4157/// \param __b06
4158/// An 8-bit integral value used to initialize bits [207:200] of the result.
4159/// \param __b05
4160/// An 8-bit integral value used to initialize bits [215:208] of the result.
4161/// \param __b04
4162/// An 8-bit integral value used to initialize bits [223:216] of the result.
4163/// \param __b03
4164/// An 8-bit integral value used to initialize bits [231:224] of the result.
4165/// \param __b02
4166/// An 8-bit integral value used to initialize bits [239:232] of the result.
4167/// \param __b01
4168/// An 8-bit integral value used to initialize bits [247:240] of the result.
4169/// \param __b00
4170/// An 8-bit integral value used to initialize bits [255:248] of the result.
4171/// \returns An initialized 256-bit integer vector.
4172static __inline __m256i __DEFAULT_FN_ATTRS
4173_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4174 char __b27, char __b26, char __b25, char __b24,
4175 char __b23, char __b22, char __b21, char __b20,
4176 char __b19, char __b18, char __b17, char __b16,
4177 char __b15, char __b14, char __b13, char __b12,
4178 char __b11, char __b10, char __b09, char __b08,
4179 char __b07, char __b06, char __b05, char __b04,
4180 char __b03, char __b02, char __b01, char __b00)
4181{
4182 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4183 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4184 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4185 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4186}
4187
4188/// Constructs a 256-bit integer vector, initialized in reverse order
4189/// with the specified 64-bit integral values.
4190///
4191/// \headerfile <x86intrin.h>
4192///
4193/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4194/// instruction.
4195///
4196/// \param __a
4197/// A 64-bit integral value used to initialize bits [63:0] of the result.
4198/// \param __b
4199/// A 64-bit integral value used to initialize bits [127:64] of the result.
4200/// \param __c
4201/// A 64-bit integral value used to initialize bits [191:128] of the result.
4202/// \param __d
4203/// A 64-bit integral value used to initialize bits [255:192] of the result.
4204/// \returns An initialized 256-bit integer vector.
4205static __inline __m256i __DEFAULT_FN_ATTRS
4206_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4207{
4208 return _mm256_set_epi64x(__d, __c, __b, __a);
4209}
4210
4211/* Create vectors with repeated elements */
4212/// Constructs a 256-bit floating-point vector of [4 x double], with each
4213/// of the four double-precision floating-point vector elements set to the
4214/// specified double-precision floating-point value.
4215///
4216/// \headerfile <x86intrin.h>
4217///
4218/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4219///
4220/// \param __w
4221/// A double-precision floating-point value used to initialize each vector
4222/// element of the result.
4223/// \returns An initialized 256-bit floating-point vector of [4 x double].
4224static __inline __m256d __DEFAULT_FN_ATTRS
4226{
4227 return _mm256_set_pd(__w, __w, __w, __w);
4228}
4229
4230/// Constructs a 256-bit floating-point vector of [8 x float], with each
4231/// of the eight single-precision floating-point vector elements set to the
4232/// specified single-precision floating-point value.
4233///
4234/// \headerfile <x86intrin.h>
4235///
4236/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4237/// instruction.
4238///
4239/// \param __w
4240/// A single-precision floating-point value used to initialize each vector
4241/// element of the result.
4242/// \returns An initialized 256-bit floating-point vector of [8 x float].
4243static __inline __m256 __DEFAULT_FN_ATTRS
4245{
4246 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4247}
4248
4249/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4250/// 32-bit integral vector elements set to the specified 32-bit integral
4251/// value.
4252///
4253/// \headerfile <x86intrin.h>
4254///
4255/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4256/// instruction.
4257///
4258/// \param __i
4259/// A 32-bit integral value used to initialize each vector element of the
4260/// result.
4261/// \returns An initialized 256-bit integer vector of [8 x i32].
4262static __inline __m256i __DEFAULT_FN_ATTRS
4264{
4265 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4266}
4267
4268/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4269/// 16-bit integral vector elements set to the specified 16-bit integral
4270/// value.
4271///
4272/// \headerfile <x86intrin.h>
4273///
4274/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4275///
4276/// \param __w
4277/// A 16-bit integral value used to initialize each vector element of the
4278/// result.
4279/// \returns An initialized 256-bit integer vector of [16 x i16].
4280static __inline __m256i __DEFAULT_FN_ATTRS
4282{
4283 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4284 __w, __w, __w, __w, __w, __w, __w, __w);
4285}
4286
4287/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4288/// 8-bit integral vector elements set to the specified 8-bit integral value.
4289///
4290/// \headerfile <x86intrin.h>
4291///
4292/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4293///
4294/// \param __b
4295/// An 8-bit integral value used to initialize each vector element of the
4296/// result.
4297/// \returns An initialized 256-bit integer vector of [32 x i8].
4298static __inline __m256i __DEFAULT_FN_ATTRS
4300{
4301 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4302 __b, __b, __b, __b, __b, __b, __b, __b,
4303 __b, __b, __b, __b, __b, __b, __b, __b,
4304 __b, __b, __b, __b, __b, __b, __b, __b);
4305}
4306
4307/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4308/// 64-bit integral vector elements set to the specified 64-bit integral
4309/// value.
4310///
4311/// \headerfile <x86intrin.h>
4312///
4313/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4314///
4315/// \param __q
4316/// A 64-bit integral value used to initialize each vector element of the
4317/// result.
4318/// \returns An initialized 256-bit integer vector of [4 x i64].
4319static __inline __m256i __DEFAULT_FN_ATTRS
4321{
4322 return _mm256_set_epi64x(__q, __q, __q, __q);
4323}
4324
/* Create zeroed vectors */
4326/// Constructs a 256-bit floating-point vector of [4 x double] with all
4327/// vector elements initialized to zero.
4328///
4329/// \headerfile <x86intrin.h>
4330///
4331/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4332///
4333/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4334static __inline __m256d __DEFAULT_FN_ATTRS
4336{
4337 return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
4338}
4339
4340/// Constructs a 256-bit floating-point vector of [8 x float] with all
4341/// vector elements initialized to zero.
4342///
4343/// \headerfile <x86intrin.h>
4344///
4345/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4346///
4347/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4348static __inline __m256 __DEFAULT_FN_ATTRS
4350{
4351 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4352}
4353
4354/// Constructs a 256-bit integer vector initialized to zero.
4355///
4356/// \headerfile <x86intrin.h>
4357///
4358/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4359///
4360/// \returns A 256-bit integer vector initialized to zero.
4361static __inline __m256i __DEFAULT_FN_ATTRS
4363{
4364 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4365}
4366
4367/* Cast between vector types */
4368/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4369/// floating-point vector of [8 x float].
4370///
4371/// \headerfile <x86intrin.h>
4372///
4373/// This intrinsic has no corresponding instruction.
4374///
4375/// \param __a
4376/// A 256-bit floating-point vector of [4 x double].
4377/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4378/// bitwise pattern as the parameter.
4379static __inline __m256 __DEFAULT_FN_ATTRS
4381{
4382 return (__m256)__a;
4383}
4384
4385/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4386/// integer vector.
4387///
4388/// \headerfile <x86intrin.h>
4389///
4390/// This intrinsic has no corresponding instruction.
4391///
4392/// \param __a
4393/// A 256-bit floating-point vector of [4 x double].
4394/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4395/// parameter.
4396static __inline __m256i __DEFAULT_FN_ATTRS
4398{
4399 return (__m256i)__a;
4400}
4401
4402/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4403/// floating-point vector of [4 x double].
4404///
4405/// \headerfile <x86intrin.h>
4406///
4407/// This intrinsic has no corresponding instruction.
4408///
4409/// \param __a
4410/// A 256-bit floating-point vector of [8 x float].
4411/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4412/// bitwise pattern as the parameter.
4413static __inline __m256d __DEFAULT_FN_ATTRS
4415{
4416 return (__m256d)__a;
4417}
4418
4419/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4420/// integer vector.
4421///
4422/// \headerfile <x86intrin.h>
4423///
4424/// This intrinsic has no corresponding instruction.
4425///
4426/// \param __a
4427/// A 256-bit floating-point vector of [8 x float].
4428/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4429/// parameter.
4430static __inline __m256i __DEFAULT_FN_ATTRS
4432{
4433 return (__m256i)__a;
4434}
4435
4436/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4437/// of [8 x float].
4438///
4439/// \headerfile <x86intrin.h>
4440///
4441/// This intrinsic has no corresponding instruction.
4442///
4443/// \param __a
4444/// A 256-bit integer vector.
4445/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4446/// bitwise pattern as the parameter.
4447static __inline __m256 __DEFAULT_FN_ATTRS
4449{
4450 return (__m256)__a;
4451}
4452
4453/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4454/// of [4 x double].
4455///
4456/// \headerfile <x86intrin.h>
4457///
4458/// This intrinsic has no corresponding instruction.
4459///
4460/// \param __a
4461/// A 256-bit integer vector.
4462/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4463/// bitwise pattern as the parameter.
4464static __inline __m256d __DEFAULT_FN_ATTRS
4466{
4467 return (__m256d)__a;
4468}
4469
4470/// Returns the lower 128 bits of a 256-bit floating-point vector of
4471/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4472///
4473/// \headerfile <x86intrin.h>
4474///
4475/// This intrinsic has no corresponding instruction.
4476///
4477/// \param __a
4478/// A 256-bit floating-point vector of [4 x double].
4479/// \returns A 128-bit floating-point vector of [2 x double] containing the
4480/// lower 128 bits of the parameter.
4481static __inline __m128d __DEFAULT_FN_ATTRS
4483{
4484 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4485}
4486
4487/// Returns the lower 128 bits of a 256-bit floating-point vector of
4488/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4489///
4490/// \headerfile <x86intrin.h>
4491///
4492/// This intrinsic has no corresponding instruction.
4493///
4494/// \param __a
4495/// A 256-bit floating-point vector of [8 x float].
4496/// \returns A 128-bit floating-point vector of [4 x float] containing the
4497/// lower 128 bits of the parameter.
4498static __inline __m128 __DEFAULT_FN_ATTRS
4500{
4501 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4502}
4503
4504/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4505///
4506/// \headerfile <x86intrin.h>
4507///
4508/// This intrinsic has no corresponding instruction.
4509///
4510/// \param __a
4511/// A 256-bit integer vector.
4512/// \returns A 128-bit integer vector containing the lower 128 bits of the
4513/// parameter.
4514static __inline __m128i __DEFAULT_FN_ATTRS
4516{
4517 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4518}
4519
4520/// Constructs a 256-bit floating-point vector of [4 x double] from a
4521/// 128-bit floating-point vector of [2 x double].
4522///
4523/// The lower 128 bits contain the value of the source vector. The contents
4524/// of the upper 128 bits are undefined.
4525///
4526/// \headerfile <x86intrin.h>
4527///
4528/// This intrinsic has no corresponding instruction.
4529///
4530/// \param __a
4531/// A 128-bit vector of [2 x double].
4532/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4533/// contain the value of the parameter. The contents of the upper 128 bits
4534/// are undefined.
4535static __inline __m256d __DEFAULT_FN_ATTRS
4537{
4538 return __builtin_shufflevector(
4539 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4540}
4541
4542/// Constructs a 256-bit floating-point vector of [8 x float] from a
4543/// 128-bit floating-point vector of [4 x float].
4544///
4545/// The lower 128 bits contain the value of the source vector. The contents
4546/// of the upper 128 bits are undefined.
4547///
4548/// \headerfile <x86intrin.h>
4549///
4550/// This intrinsic has no corresponding instruction.
4551///
4552/// \param __a
4553/// A 128-bit vector of [4 x float].
4554/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4555/// contain the value of the parameter. The contents of the upper 128 bits
4556/// are undefined.
4557static __inline __m256 __DEFAULT_FN_ATTRS
4559{
4560 return __builtin_shufflevector((__v4sf)__a,
4561 (__v4sf)__builtin_nondeterministic_value(__a),
4562 0, 1, 2, 3, 4, 5, 6, 7);
4563}
4564
4565/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4566///
4567/// The lower 128 bits contain the value of the source vector. The contents
4568/// of the upper 128 bits are undefined.
4569///
4570/// \headerfile <x86intrin.h>
4571///
4572/// This intrinsic has no corresponding instruction.
4573///
4574/// \param __a
4575/// A 128-bit integer vector.
4576/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4577/// the parameter. The contents of the upper 128 bits are undefined.
4578static __inline __m256i __DEFAULT_FN_ATTRS
4580{
4581 return __builtin_shufflevector(
4582 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4583}
4584
4585/// Constructs a 256-bit floating-point vector of [4 x double] from a
4586/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4587/// contain the value of the source vector. The upper 128 bits are set
4588/// to zero.
4589///
4590/// \headerfile <x86intrin.h>
4591///
4592/// This intrinsic has no corresponding instruction.
4593///
4594/// \param __a
4595/// A 128-bit vector of [2 x double].
4596/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4597/// contain the value of the parameter. The upper 128 bits are set to zero.
4598static __inline __m256d __DEFAULT_FN_ATTRS
4600{
4601 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4602}
4603
4604/// Constructs a 256-bit floating-point vector of [8 x float] from a
4605/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4606/// the value of the source vector. The upper 128 bits are set to zero.
4607///
4608/// \headerfile <x86intrin.h>
4609///
4610/// This intrinsic has no corresponding instruction.
4611///
4612/// \param __a
4613/// A 128-bit vector of [4 x float].
4614/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4615/// contain the value of the parameter. The upper 128 bits are set to zero.
4616static __inline __m256 __DEFAULT_FN_ATTRS
4618{
4619 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4620}
4621
4622/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4623/// The lower 128 bits contain the value of the source vector. The upper
4624/// 128 bits are set to zero.
4625///
4626/// \headerfile <x86intrin.h>
4627///
4628/// This intrinsic has no corresponding instruction.
4629///
4630/// \param __a
4631/// A 128-bit integer vector.
4632/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4633/// the parameter. The upper 128 bits are set to zero.
4634static __inline __m256i __DEFAULT_FN_ATTRS
4636{
4637 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4638}
4639
4640/*
4641 Vector insert.
4642 We use macros rather than inlines because we only want to accept
4643 invocations where the immediate M is a constant expression.
4644*/
/// Produces a 256-bit vector of [8 x float] that is a copy of the first
///    operand with one of its 128-bit halves replaced by the 128-bit vector
///    of [4 x float] given as the second operand.
///
///    The immediate operand chooses which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It supplies the half of the result
///    that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. Its contents are written to the lower
///    or the upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Its least significant bit selects the half that
///    is replaced: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
#define _mm256_insertf128_ps(V1, V2, M)                                        \
  ((__m256)__builtin_ia32_vinsertf128_ps256(                                   \
      (__v8sf)(__m256)(V1), (__v4sf)(__m128)(V2), (int)(M)))
4682
/// Produces a 256-bit vector of [4 x double] that is a copy of the first
///    operand with one of its 128-bit halves replaced by the 128-bit vector
///    of [2 x double] given as the second operand.
///
///    The immediate operand chooses which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the half of the result
///    that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. Its contents are written to the lower
///    or the upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Its least significant bit selects the half that
///    is replaced: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M)                                        \
  ((__m256d)__builtin_ia32_vinsertf128_pd256(                                  \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))
4720
/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
4758
/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
4787
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
4811
/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
4835
4836/// Constructs a 256-bit floating-point vector of [8 x float] by
4837/// concatenating two 128-bit floating-point vectors of [4 x float].
4838///
4839/// \headerfile <x86intrin.h>
4840///
4841/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4842///
4843/// \param __hi
4844/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4845/// 128 bits of the result.
4846/// \param __lo
4847/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4848/// 128 bits of the result.
4849/// \returns A 256-bit floating-point vector of [8 x float] containing the
4850/// concatenated result.
4851static __inline __m256 __DEFAULT_FN_ATTRS
4852_mm256_set_m128 (__m128 __hi, __m128 __lo)
4853{
4854 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4855}
4856
4857/// Constructs a 256-bit floating-point vector of [4 x double] by
4858/// concatenating two 128-bit floating-point vectors of [2 x double].
4859///
4860/// \headerfile <x86intrin.h>
4861///
4862/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4863///
4864/// \param __hi
4865/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4866/// 128 bits of the result.
4867/// \param __lo
4868/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4869/// 128 bits of the result.
4870/// \returns A 256-bit floating-point vector of [4 x double] containing the
4871/// concatenated result.
4872static __inline __m256d __DEFAULT_FN_ATTRS
4873_mm256_set_m128d (__m128d __hi, __m128d __lo)
4874{
4875 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4876}
4877
4878/// Constructs a 256-bit integer vector by concatenating two 128-bit
4879/// integer vectors.
4880///
4881/// \headerfile <x86intrin.h>
4882///
4883/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4884///
4885/// \param __hi
4886/// A 128-bit integer vector to be copied to the upper 128 bits of the
4887/// result.
4888/// \param __lo
4889/// A 128-bit integer vector to be copied to the lower 128 bits of the
4890/// result.
4891/// \returns A 256-bit integer vector containing the concatenated result.
4892static __inline __m256i __DEFAULT_FN_ATTRS
4893_mm256_set_m128i (__m128i __hi, __m128i __lo)
4894{
4895 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4896}
4897
4898/// Constructs a 256-bit floating-point vector of [8 x float] by
4899/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4900/// similar to _mm256_set_m128, but the order of the input parameters is
4901/// swapped.
4902///
4903/// \headerfile <x86intrin.h>
4904///
4905/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4906///
4907/// \param __lo
4908/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4909/// 128 bits of the result.
4910/// \param __hi
4911/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4912/// 128 bits of the result.
4913/// \returns A 256-bit floating-point vector of [8 x float] containing the
4914/// concatenated result.
4915static __inline __m256 __DEFAULT_FN_ATTRS
4916_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4917{
4918 return _mm256_set_m128(__hi, __lo);
4919}
4920
4921/// Constructs a 256-bit floating-point vector of [4 x double] by
4922/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4923/// similar to _mm256_set_m128d, but the order of the input parameters is
4924/// swapped.
4925///
4926/// \headerfile <x86intrin.h>
4927///
4928/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4929///
4930/// \param __lo
4931/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4932/// 128 bits of the result.
4933/// \param __hi
4934/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4935/// 128 bits of the result.
4936/// \returns A 256-bit floating-point vector of [4 x double] containing the
4937/// concatenated result.
4938static __inline __m256d __DEFAULT_FN_ATTRS
4939_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4940{
4941 return (__m256d)_mm256_set_m128d(__hi, __lo);
4942}
4943
4944/// Constructs a 256-bit integer vector by concatenating two 128-bit
4945/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4946/// the input parameters is swapped.
4947///
4948/// \headerfile <x86intrin.h>
4949///
4950/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4951///
4952/// \param __lo
4953/// A 128-bit integer vector to be copied to the lower 128 bits of the
4954/// result.
4955/// \param __hi
4956/// A 128-bit integer vector to be copied to the upper 128 bits of the
4957/// result.
4958/// \returns A 256-bit integer vector containing the concatenated result.
4959static __inline __m256i __DEFAULT_FN_ATTRS
4960_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4961{
4962 return (__m256i)_mm256_set_m128i(__hi, __lo);
4963}
4964
/* SIMD load ops (unaligned) */
4966/// Loads two 128-bit floating-point vectors of [4 x float] from
4967/// unaligned memory locations and constructs a 256-bit floating-point vector
4968/// of [8 x float] by concatenating the two 128-bit vectors.
4969///
4970/// \headerfile <x86intrin.h>
4971///
4972/// This intrinsic corresponds to load instructions followed by the
4973/// <c> VINSERTF128 </c> instruction.
4974///
4975/// \param __addr_hi
4976/// A pointer to a 128-bit memory location containing 4 consecutive
4977/// single-precision floating-point values. These values are to be copied to
4978/// bits[255:128] of the result. The address of the memory location does not
4979/// have to be aligned.
4980/// \param __addr_lo
4981/// A pointer to a 128-bit memory location containing 4 consecutive
4982/// single-precision floating-point values. These values are to be copied to
4983/// bits[127:0] of the result. The address of the memory location does not
4984/// have to be aligned.
4985/// \returns A 256-bit floating-point vector of [8 x float] containing the
4986/// concatenated result.
4987static __inline __m256 __DEFAULT_FN_ATTRS
4988_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4989{
4990 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4991}
4992
4993/// Loads two 128-bit floating-point vectors of [2 x double] from
4994/// unaligned memory locations and constructs a 256-bit floating-point vector
4995/// of [4 x double] by concatenating the two 128-bit vectors.
4996///
4997/// \headerfile <x86intrin.h>
4998///
4999/// This intrinsic corresponds to load instructions followed by the
5000/// <c> VINSERTF128 </c> instruction.
5001///
5002/// \param __addr_hi
5003/// A pointer to a 128-bit memory location containing two consecutive
5004/// double-precision floating-point values. These values are to be copied to
5005/// bits[255:128] of the result. The address of the memory location does not
5006/// have to be aligned.
5007/// \param __addr_lo
5008/// A pointer to a 128-bit memory location containing two consecutive
5009/// double-precision floating-point values. These values are to be copied to
5010/// bits[127:0] of the result. The address of the memory location does not
5011/// have to be aligned.
5012/// \returns A 256-bit floating-point vector of [4 x double] containing the
5013/// concatenated result.
5014static __inline __m256d __DEFAULT_FN_ATTRS
5015_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
5016{
5017 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
5018}
5019
5020/// Loads two 128-bit integer vectors from unaligned memory locations and
5021/// constructs a 256-bit integer vector by concatenating the two 128-bit
5022/// vectors.
5023///
5024/// \headerfile <x86intrin.h>
5025///
5026/// This intrinsic corresponds to load instructions followed by the
5027/// <c> VINSERTF128 </c> instruction.
5028///
5029/// \param __addr_hi
5030/// A pointer to a 128-bit memory location containing a 128-bit integer
5031/// vector. This vector is to be copied to bits[255:128] of the result. The
5032/// address of the memory location does not have to be aligned.
5033/// \param __addr_lo
5034/// A pointer to a 128-bit memory location containing a 128-bit integer
5035/// vector. This vector is to be copied to bits[127:0] of the result. The
5036/// address of the memory location does not have to be aligned.
5037/// \returns A 256-bit integer vector containing the concatenated result.
5038static __inline __m256i __DEFAULT_FN_ATTRS
5039_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
5040{
5041 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5042}
5043
/* SIMD store ops (unaligned) */
5045/// Stores the upper and lower 128 bits of a 256-bit floating-point
5046/// vector of [8 x float] into two different unaligned memory locations.
5047///
5048/// \headerfile <x86intrin.h>
5049///
5050/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5051/// store instructions.
5052///
5053/// \param __addr_hi
5054/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5055/// copied to this memory location. The address of this memory location does
5056/// not have to be aligned.
5057/// \param __addr_lo
5058/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5059/// copied to this memory location. The address of this memory location does
5060/// not have to be aligned.
5061/// \param __a
5062/// A 256-bit floating-point vector of [8 x float].
5063static __inline void __DEFAULT_FN_ATTRS
5064_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5065{
5066 __m128 __v128;
5067
5068 __v128 = _mm256_castps256_ps128(__a);
5069 _mm_storeu_ps(__addr_lo, __v128);
5070 __v128 = _mm256_extractf128_ps(__a, 1);
5071 _mm_storeu_ps(__addr_hi, __v128);
5072}
5073
5074/// Stores the upper and lower 128 bits of a 256-bit floating-point
5075/// vector of [4 x double] into two different unaligned memory locations.
5076///
5077/// \headerfile <x86intrin.h>
5078///
5079/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5080/// store instructions.
5081///
5082/// \param __addr_hi
5083/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5084/// copied to this memory location. The address of this memory location does
5085/// not have to be aligned.
5086/// \param __addr_lo
5087/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5088/// copied to this memory location. The address of this memory location does
5089/// not have to be aligned.
5090/// \param __a
5091/// A 256-bit floating-point vector of [4 x double].
5092static __inline void __DEFAULT_FN_ATTRS
5093_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5094{
5095 __m128d __v128;
5096
5097 __v128 = _mm256_castpd256_pd128(__a);
5098 _mm_storeu_pd(__addr_lo, __v128);
5099 __v128 = _mm256_extractf128_pd(__a, 1);
5100 _mm_storeu_pd(__addr_hi, __v128);
5101}
5102
5103/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5104/// two different unaligned memory locations.
5105///
5106/// \headerfile <x86intrin.h>
5107///
5108/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5109/// store instructions.
5110///
5111/// \param __addr_hi
5112/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5113/// copied to this memory location. The address of this memory location does
5114/// not have to be aligned.
5115/// \param __addr_lo
5116/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5117/// copied to this memory location. The address of this memory location does
5118/// not have to be aligned.
5119/// \param __a
5120/// A 256-bit integer vector.
5121static __inline void __DEFAULT_FN_ATTRS
5122_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5123{
5124 __m128i __v128;
5125
5126 __v128 = _mm256_castsi256_si128(__a);
5127 _mm_storeu_si128(__addr_lo, __v128);
5128 __v128 = _mm256_extractf128_si256(__a, 1);
5129 _mm_storeu_si128(__addr_hi, __v128);
5130}
5131
5132#undef __DEFAULT_FN_ATTRS
5133#undef __DEFAULT_FN_ATTRS128
5134
5135#endif /* __AVXINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3067
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition: avxintrin.h:4852
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition: avxintrin.h:3111
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:753
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition: avxintrin.h:3317
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2939
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition: avxintrin.h:101
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition: avxintrin.h:3613
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition: avxintrin.h:4448
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition: avxintrin.h:3131
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: avxintrin.h:4173
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition: avxintrin.h:2293
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition: avxintrin.h:3281
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4635
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition: avxintrin.h:3337
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition: avxintrin.h:2480
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition: avxintrin.h:4988
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:365
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3427
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition: avxintrin.h:4244
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition: avxintrin.h:4431
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:665
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition: avxintrin.h:2201
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:399
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition: avxintrin.h:2413
static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-p...
Definition: avxintrin.h:4482
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition: avxintrin.h:2985
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition: avxintrin.h:2217
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zer...
Definition: avxintrin.h:4349
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition: avxintrin.h:3662
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2887
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
Definition: avxintrin.h:4499
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3452
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3403
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set...
Definition: avxintrin.h:4299
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition: avxintrin.h:991
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition: avxintrin.h:2252
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition: avxintrin.h:3375
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition: avxintrin.h:4785
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition: avxintrin.h:4833
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition: avxintrin.h:3224
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2966
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition: avxintrin.h:1415
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition: avxintrin.h:4873
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition: avxintrin.h:4380
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition: avxintrin.h:4225
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition: avxintrin.h:3593
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition: avxintrin.h:900
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition: avxintrin.h:3649
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:382
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: avxintrin.h:4088
#define __DEFAULT_FN_ATTRS
Definition: avxintrin.h:61
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition: avxintrin.h:3525
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4536
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition: avxintrin.h:2273
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition: avxintrin.h:3675
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition: avxintrin.h:4008
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition: avxintrin.h:2236
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements se...
Definition: avxintrin.h:4320
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4465
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2623
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition: avxintrin.h:295
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition: avxintrin.h:3702
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2857
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition: avxintrin.h:3167
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:683
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition: avxintrin.h:4809
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers re...
Definition: avxintrin.h:2313
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition: avxintrin.h:4558
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4960
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition: avxintrin.h:137
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition: avxintrin.h:3939
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3089
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition: avxintrin.h:253
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2799
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition: avxintrin.h:5015
static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition: avxintrin.h:2362
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition: avxintrin.h:2388
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition: avxintrin.h:193
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2711
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two diffe...
Definition: avxintrin.h:5093
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition: avxintrin.h:348
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:156
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:730
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:175
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2740
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition: avxintrin.h:313
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:569
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition: avxintrin.h:3549
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition: avxintrin.h:3501
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:647
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p in...
Definition: avxintrin.h:3184
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2913
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4599
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral values.
Definition: avxintrin.h:4206
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4414
static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition: avxintrin.h:2329
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:707
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:776
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2769
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition: avxintrin.h:2186
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition: avxintrin.h:4939
#define __DEFAULT_FN_ATTRS128
Definition: avxintrin.h:64
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the values contained in the first source operand.
Definition: avxintrin.h:590
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition: avxintrin.h:4040
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition: avxintrin.h:1443
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition: avxintrin.h:2458
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory locations.
Definition: avxintrin.h:5122
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition: avxintrin.h:119
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4893
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:629
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2564
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition: avxintrin.h:4335
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition: avxintrin.h:2435
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition: avxintrin.h:3821
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition: avxintrin.h:5039
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition: avxintrin.h:806
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition: avxintrin.h:4362
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition: avxintrin.h:4397
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3476
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition: avxintrin.h:211
static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition: avxintrin.h:2345
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition: avxintrin.h:3634
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition: avxintrin.h:4916
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:3240
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3358
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition: avxintrin.h:274
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition: avxintrin.h:4515
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition: avxintrin.h:3773
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4579
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2507
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition: avxintrin.h:3904
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p in...
Definition: avxintrin.h:3204
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-preci...
Definition: avxintrin.h:3741
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3045
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2681
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition: avxintrin.h:3003
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition: avxintrin.h:331
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2534
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition: avxintrin.h:845
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition: avxintrin.h:232
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2652
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition: avxintrin.h:4281
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:3261
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:551
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3299
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two differ...
Definition: avxintrin.h:5064
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements se...
Definition: avxintrin.h:4263
typedef double __v4df __attribute__((__vector_size__(32)))
Definition: avxintrin.h:17
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2593
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2828
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition: avxintrin.h:4617
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition: avxintrin.h:83
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition: avxintrin.h:3573
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [4 x double].
Definition: avxintrin.h:3151
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the values contained in the first source operand.
Definition: avxintrin.h:611
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the spe...
Definition: avxintrin.h:3968
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1618
static __inline__ void int __a
Definition: emmintrin.h:4064
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3447
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1866
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1979
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3865
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3896
struct __storeu_i16 *__P __v
Definition: immintrin.h:472
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:2042
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:2122
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1879