/* clang 20.0.0git
 * avxintrin.h
 * Go to the documentation of this file.
 */
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* Internal 256-bit (32-byte) vector types, one per element type. These are
 * the types the intrinsics cast to before operating on individual lanes. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types, declared with 32-byte alignment. */
typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Same layouts declared with 1-byte alignment (the "_u" variants). */
typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
41
42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS is used by the 256-bit intrinsics and
 * __DEFAULT_FN_ATTRS128 by the 128-bit ones; they differ only in the
 * __min_vector_width__ value. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(128)))
59
60/* Arithmetic */
61/// Adds two 256-bit vectors of [4 x double].
62///
63/// \headerfile <x86intrin.h>
64///
65/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
66///
67/// \param __a
68/// A 256-bit vector of [4 x double] containing one of the source operands.
69/// \param __b
70/// A 256-bit vector of [4 x double] containing one of the source operands.
71/// \returns A 256-bit vector of [4 x double] containing the sums of both
72/// operands.
73static __inline __m256d __DEFAULT_FN_ATTRS
74_mm256_add_pd(__m256d __a, __m256d __b)
75{
76 return (__m256d)((__v4df)__a+(__v4df)__b);
77}
78
79/// Adds two 256-bit vectors of [8 x float].
80///
81/// \headerfile <x86intrin.h>
82///
83/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
84///
85/// \param __a
86/// A 256-bit vector of [8 x float] containing one of the source operands.
87/// \param __b
88/// A 256-bit vector of [8 x float] containing one of the source operands.
89/// \returns A 256-bit vector of [8 x float] containing the sums of both
90/// operands.
91static __inline __m256 __DEFAULT_FN_ATTRS
92_mm256_add_ps(__m256 __a, __m256 __b)
93{
94 return (__m256)((__v8sf)__a+(__v8sf)__b);
95}
96
97/// Subtracts two 256-bit vectors of [4 x double].
98///
99/// \headerfile <x86intrin.h>
100///
101/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
102///
103/// \param __a
104/// A 256-bit vector of [4 x double] containing the minuend.
105/// \param __b
106/// A 256-bit vector of [4 x double] containing the subtrahend.
107/// \returns A 256-bit vector of [4 x double] containing the differences between
108/// both operands.
109static __inline __m256d __DEFAULT_FN_ATTRS
110_mm256_sub_pd(__m256d __a, __m256d __b)
111{
112 return (__m256d)((__v4df)__a-(__v4df)__b);
113}
114
115/// Subtracts two 256-bit vectors of [8 x float].
116///
117/// \headerfile <x86intrin.h>
118///
119/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
120///
121/// \param __a
122/// A 256-bit vector of [8 x float] containing the minuend.
123/// \param __b
124/// A 256-bit vector of [8 x float] containing the subtrahend.
125/// \returns A 256-bit vector of [8 x float] containing the differences between
126/// both operands.
127static __inline __m256 __DEFAULT_FN_ATTRS
128_mm256_sub_ps(__m256 __a, __m256 __b)
129{
130 return (__m256)((__v8sf)__a-(__v8sf)__b);
131}
132
133/// Adds the even-indexed values and subtracts the odd-indexed values of
134/// two 256-bit vectors of [4 x double].
135///
136/// \headerfile <x86intrin.h>
137///
138/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
139///
140/// \param __a
141/// A 256-bit vector of [4 x double] containing the left source operand.
142/// \param __b
143/// A 256-bit vector of [4 x double] containing the right source operand.
144/// \returns A 256-bit vector of [4 x double] containing the alternating sums
145/// and differences between both operands.
146static __inline __m256d __DEFAULT_FN_ATTRS
147_mm256_addsub_pd(__m256d __a, __m256d __b)
148{
149 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
150}
151
152/// Adds the even-indexed values and subtracts the odd-indexed values of
153/// two 256-bit vectors of [8 x float].
154///
155/// \headerfile <x86intrin.h>
156///
157/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
158///
159/// \param __a
160/// A 256-bit vector of [8 x float] containing the left source operand.
161/// \param __b
162/// A 256-bit vector of [8 x float] containing the right source operand.
163/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
164/// differences between both operands.
165static __inline __m256 __DEFAULT_FN_ATTRS
166_mm256_addsub_ps(__m256 __a, __m256 __b)
167{
168 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
169}
170
171/// Divides two 256-bit vectors of [4 x double].
172///
173/// \headerfile <x86intrin.h>
174///
175/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
176///
177/// \param __a
178/// A 256-bit vector of [4 x double] containing the dividend.
179/// \param __b
180/// A 256-bit vector of [4 x double] containing the divisor.
181/// \returns A 256-bit vector of [4 x double] containing the quotients of both
182/// operands.
183static __inline __m256d __DEFAULT_FN_ATTRS
184_mm256_div_pd(__m256d __a, __m256d __b)
185{
186 return (__m256d)((__v4df)__a/(__v4df)__b);
187}
188
189/// Divides two 256-bit vectors of [8 x float].
190///
191/// \headerfile <x86intrin.h>
192///
193/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
194///
195/// \param __a
196/// A 256-bit vector of [8 x float] containing the dividend.
197/// \param __b
198/// A 256-bit vector of [8 x float] containing the divisor.
199/// \returns A 256-bit vector of [8 x float] containing the quotients of both
200/// operands.
201static __inline __m256 __DEFAULT_FN_ATTRS
202_mm256_div_ps(__m256 __a, __m256 __b)
203{
204 return (__m256)((__v8sf)__a/(__v8sf)__b);
205}
206
207/// Compares two 256-bit vectors of [4 x double] and returns the greater
208/// of each pair of values.
209///
210/// If either value in a comparison is NaN, returns the value from \a __b.
211///
212/// \headerfile <x86intrin.h>
213///
214/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
215///
216/// \param __a
217/// A 256-bit vector of [4 x double] containing one of the operands.
218/// \param __b
219/// A 256-bit vector of [4 x double] containing one of the operands.
220/// \returns A 256-bit vector of [4 x double] containing the maximum values
221/// between both operands.
222static __inline __m256d __DEFAULT_FN_ATTRS
223_mm256_max_pd(__m256d __a, __m256d __b)
224{
225 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
226}
227
228/// Compares two 256-bit vectors of [8 x float] and returns the greater
229/// of each pair of values.
230///
231/// If either value in a comparison is NaN, returns the value from \a __b.
232///
233/// \headerfile <x86intrin.h>
234///
235/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
236///
237/// \param __a
238/// A 256-bit vector of [8 x float] containing one of the operands.
239/// \param __b
240/// A 256-bit vector of [8 x float] containing one of the operands.
241/// \returns A 256-bit vector of [8 x float] containing the maximum values
242/// between both operands.
243static __inline __m256 __DEFAULT_FN_ATTRS
244_mm256_max_ps(__m256 __a, __m256 __b)
245{
246 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
247}
248
249/// Compares two 256-bit vectors of [4 x double] and returns the lesser
250/// of each pair of values.
251///
252/// If either value in a comparison is NaN, returns the value from \a __b.
253///
254/// \headerfile <x86intrin.h>
255///
256/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
257///
258/// \param __a
259/// A 256-bit vector of [4 x double] containing one of the operands.
260/// \param __b
261/// A 256-bit vector of [4 x double] containing one of the operands.
262/// \returns A 256-bit vector of [4 x double] containing the minimum values
263/// between both operands.
264static __inline __m256d __DEFAULT_FN_ATTRS
265_mm256_min_pd(__m256d __a, __m256d __b)
266{
267 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
268}
269
270/// Compares two 256-bit vectors of [8 x float] and returns the lesser
271/// of each pair of values.
272///
273/// If either value in a comparison is NaN, returns the value from \a __b.
274///
275/// \headerfile <x86intrin.h>
276///
277/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
278///
279/// \param __a
280/// A 256-bit vector of [8 x float] containing one of the operands.
281/// \param __b
282/// A 256-bit vector of [8 x float] containing one of the operands.
283/// \returns A 256-bit vector of [8 x float] containing the minimum values
284/// between both operands.
285static __inline __m256 __DEFAULT_FN_ATTRS
286_mm256_min_ps(__m256 __a, __m256 __b)
287{
288 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
289}
290
291/// Multiplies two 256-bit vectors of [4 x double].
292///
293/// \headerfile <x86intrin.h>
294///
295/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
296///
297/// \param __a
298/// A 256-bit vector of [4 x double] containing one of the operands.
299/// \param __b
300/// A 256-bit vector of [4 x double] containing one of the operands.
301/// \returns A 256-bit vector of [4 x double] containing the products of both
302/// operands.
303static __inline __m256d __DEFAULT_FN_ATTRS
304_mm256_mul_pd(__m256d __a, __m256d __b)
305{
306 return (__m256d)((__v4df)__a * (__v4df)__b);
307}
308
309/// Multiplies two 256-bit vectors of [8 x float].
310///
311/// \headerfile <x86intrin.h>
312///
313/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
314///
315/// \param __a
316/// A 256-bit vector of [8 x float] containing one of the operands.
317/// \param __b
318/// A 256-bit vector of [8 x float] containing one of the operands.
319/// \returns A 256-bit vector of [8 x float] containing the products of both
320/// operands.
321static __inline __m256 __DEFAULT_FN_ATTRS
322_mm256_mul_ps(__m256 __a, __m256 __b)
323{
324 return (__m256)((__v8sf)__a * (__v8sf)__b);
325}
326
327/// Calculates the square roots of the values in a 256-bit vector of
328/// [4 x double].
329///
330/// \headerfile <x86intrin.h>
331///
332/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
333///
334/// \param __a
335/// A 256-bit vector of [4 x double].
336/// \returns A 256-bit vector of [4 x double] containing the square roots of the
337/// values in the operand.
338static __inline __m256d __DEFAULT_FN_ATTRS
340{
341 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
342}
343
344/// Calculates the square roots of the values in a 256-bit vector of
345/// [8 x float].
346///
347/// \headerfile <x86intrin.h>
348///
349/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
350///
351/// \param __a
352/// A 256-bit vector of [8 x float].
353/// \returns A 256-bit vector of [8 x float] containing the square roots of the
354/// values in the operand.
355static __inline __m256 __DEFAULT_FN_ATTRS
357{
358 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
359}
360
361/// Calculates the reciprocal square roots of the values in a 256-bit
362/// vector of [8 x float].
363///
364/// \headerfile <x86intrin.h>
365///
366/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
367///
368/// \param __a
369/// A 256-bit vector of [8 x float].
370/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
371/// roots of the values in the operand.
372static __inline __m256 __DEFAULT_FN_ATTRS
374{
375 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
376}
377
378/// Calculates the reciprocals of the values in a 256-bit vector of
379/// [8 x float].
380///
381/// \headerfile <x86intrin.h>
382///
383/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
384///
385/// \param __a
386/// A 256-bit vector of [8 x float].
387/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
388/// values in the operand.
389static __inline __m256 __DEFAULT_FN_ATTRS
391{
392 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
393}
394
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
426
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
458
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
475
/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
493
/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
510
/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
527
528/* Logical */
529/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
530///
531/// \headerfile <x86intrin.h>
532///
533/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
534///
535/// \param __a
536/// A 256-bit vector of [4 x double] containing one of the source operands.
537/// \param __b
538/// A 256-bit vector of [4 x double] containing one of the source operands.
539/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
540/// values between both operands.
541static __inline __m256d __DEFAULT_FN_ATTRS
542_mm256_and_pd(__m256d __a, __m256d __b)
543{
544 return (__m256d)((__v4du)__a & (__v4du)__b);
545}
546
547/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
548///
549/// \headerfile <x86intrin.h>
550///
551/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
552///
553/// \param __a
554/// A 256-bit vector of [8 x float] containing one of the source operands.
555/// \param __b
556/// A 256-bit vector of [8 x float] containing one of the source operands.
557/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
558/// values between both operands.
559static __inline __m256 __DEFAULT_FN_ATTRS
560_mm256_and_ps(__m256 __a, __m256 __b)
561{
562 return (__m256)((__v8su)__a & (__v8su)__b);
563}
564
565/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
566/// the one's complement of the values contained in the first source operand.
567///
568/// \headerfile <x86intrin.h>
569///
570/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
571///
572/// \param __a
573/// A 256-bit vector of [4 x double] containing the left source operand. The
574/// one's complement of this value is used in the bitwise AND.
575/// \param __b
576/// A 256-bit vector of [4 x double] containing the right source operand.
577/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
578/// values of the second operand and the one's complement of the first
579/// operand.
580static __inline __m256d __DEFAULT_FN_ATTRS
581_mm256_andnot_pd(__m256d __a, __m256d __b)
582{
583 return (__m256d)(~(__v4du)__a & (__v4du)__b);
584}
585
586/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
587/// the one's complement of the values contained in the first source operand.
588///
589/// \headerfile <x86intrin.h>
590///
591/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
592///
593/// \param __a
594/// A 256-bit vector of [8 x float] containing the left source operand. The
595/// one's complement of this value is used in the bitwise AND.
596/// \param __b
597/// A 256-bit vector of [8 x float] containing the right source operand.
598/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
599/// values of the second operand and the one's complement of the first
600/// operand.
601static __inline __m256 __DEFAULT_FN_ATTRS
602_mm256_andnot_ps(__m256 __a, __m256 __b)
603{
604 return (__m256)(~(__v8su)__a & (__v8su)__b);
605}
606
607/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
608///
609/// \headerfile <x86intrin.h>
610///
611/// This intrinsic corresponds to the <c> VORPD </c> instruction.
612///
613/// \param __a
614/// A 256-bit vector of [4 x double] containing one of the source operands.
615/// \param __b
616/// A 256-bit vector of [4 x double] containing one of the source operands.
617/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
618/// values between both operands.
619static __inline __m256d __DEFAULT_FN_ATTRS
620_mm256_or_pd(__m256d __a, __m256d __b)
621{
622 return (__m256d)((__v4du)__a | (__v4du)__b);
623}
624
625/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
626///
627/// \headerfile <x86intrin.h>
628///
629/// This intrinsic corresponds to the <c> VORPS </c> instruction.
630///
631/// \param __a
632/// A 256-bit vector of [8 x float] containing one of the source operands.
633/// \param __b
634/// A 256-bit vector of [8 x float] containing one of the source operands.
635/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
636/// values between both operands.
637static __inline __m256 __DEFAULT_FN_ATTRS
638_mm256_or_ps(__m256 __a, __m256 __b)
639{
640 return (__m256)((__v8su)__a | (__v8su)__b);
641}
642
643/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
644///
645/// \headerfile <x86intrin.h>
646///
647/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
648///
649/// \param __a
650/// A 256-bit vector of [4 x double] containing one of the source operands.
651/// \param __b
652/// A 256-bit vector of [4 x double] containing one of the source operands.
653/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
654/// values between both operands.
655static __inline __m256d __DEFAULT_FN_ATTRS
656_mm256_xor_pd(__m256d __a, __m256d __b)
657{
658 return (__m256d)((__v4du)__a ^ (__v4du)__b);
659}
660
661/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
662///
663/// \headerfile <x86intrin.h>
664///
665/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
666///
667/// \param __a
668/// A 256-bit vector of [8 x float] containing one of the source operands.
669/// \param __b
670/// A 256-bit vector of [8 x float] containing one of the source operands.
671/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
672/// values between both operands.
673static __inline __m256 __DEFAULT_FN_ATTRS
674_mm256_xor_ps(__m256 __a, __m256 __b)
675{
676 return (__m256)((__v8su)__a ^ (__v8su)__b);
677}
678
679/* Horizontal arithmetic */
680/// Horizontally adds the adjacent pairs of values contained in two
681/// 256-bit vectors of [4 x double].
682///
683/// \headerfile <x86intrin.h>
684///
685/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
686///
687/// \param __a
688/// A 256-bit vector of [4 x double] containing one of the source operands.
689/// The horizontal sums of the values are returned in the even-indexed
690/// elements of a vector of [4 x double].
691/// \param __b
692/// A 256-bit vector of [4 x double] containing one of the source operands.
693/// The horizontal sums of the values are returned in the odd-indexed
694/// elements of a vector of [4 x double].
695/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
696/// both operands.
697static __inline __m256d __DEFAULT_FN_ATTRS
698_mm256_hadd_pd(__m256d __a, __m256d __b)
699{
700 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
701}
702
703/// Horizontally adds the adjacent pairs of values contained in two
704/// 256-bit vectors of [8 x float].
705///
706/// \headerfile <x86intrin.h>
707///
708/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
709///
710/// \param __a
711/// A 256-bit vector of [8 x float] containing one of the source operands.
712/// The horizontal sums of the values are returned in the elements with
713/// index 0, 1, 4, 5 of a vector of [8 x float].
714/// \param __b
715/// A 256-bit vector of [8 x float] containing one of the source operands.
716/// The horizontal sums of the values are returned in the elements with
717/// index 2, 3, 6, 7 of a vector of [8 x float].
718/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
719/// both operands.
720static __inline __m256 __DEFAULT_FN_ATTRS
721_mm256_hadd_ps(__m256 __a, __m256 __b)
722{
723 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
724}
725
726/// Horizontally subtracts the adjacent pairs of values contained in two
727/// 256-bit vectors of [4 x double].
728///
729/// \headerfile <x86intrin.h>
730///
731/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
732///
733/// \param __a
734/// A 256-bit vector of [4 x double] containing one of the source operands.
735/// The horizontal differences between the values are returned in the
736/// even-indexed elements of a vector of [4 x double].
737/// \param __b
738/// A 256-bit vector of [4 x double] containing one of the source operands.
739/// The horizontal differences between the values are returned in the
740/// odd-indexed elements of a vector of [4 x double].
741/// \returns A 256-bit vector of [4 x double] containing the horizontal
742/// differences of both operands.
743static __inline __m256d __DEFAULT_FN_ATTRS
744_mm256_hsub_pd(__m256d __a, __m256d __b)
745{
746 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
747}
748
749/// Horizontally subtracts the adjacent pairs of values contained in two
750/// 256-bit vectors of [8 x float].
751///
752/// \headerfile <x86intrin.h>
753///
754/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
755///
756/// \param __a
757/// A 256-bit vector of [8 x float] containing one of the source operands.
758/// The horizontal differences between the values are returned in the
759/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
760/// \param __b
761/// A 256-bit vector of [8 x float] containing one of the source operands.
762/// The horizontal differences between the values are returned in the
763/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
764/// \returns A 256-bit vector of [8 x float] containing the horizontal
765/// differences of both operands.
766static __inline __m256 __DEFAULT_FN_ATTRS
767_mm256_hsub_ps(__m256 __a, __m256 __b)
768{
769 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
770}
771
772/* Vector permutations */
773/// Copies the values in a 128-bit vector of [2 x double] as specified
774/// by the 128-bit integer vector operand.
775///
776/// \headerfile <x86intrin.h>
777///
778/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
779///
780/// \param __a
781/// A 128-bit vector of [2 x double].
782/// \param __c
783/// A 128-bit integer vector operand specifying how the values are to be
784/// copied. \n
785/// Bit [1]: \n
786/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
787/// vector. \n
788/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
789/// returned vector. \n
790/// Bit [65]: \n
791/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
792/// returned vector. \n
793/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
794/// returned vector.
795/// \returns A 128-bit vector of [2 x double] containing the copied values.
796static __inline __m128d __DEFAULT_FN_ATTRS128
797_mm_permutevar_pd(__m128d __a, __m128i __c)
798{
799 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
800}
801
802/// Copies the values in a 256-bit vector of [4 x double] as specified
803/// by the 256-bit integer vector operand.
804///
805/// \headerfile <x86intrin.h>
806///
807/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
808///
809/// \param __a
810/// A 256-bit vector of [4 x double].
811/// \param __c
812/// A 256-bit integer vector operand specifying how the values are to be
813/// copied. \n
814/// Bit [1]: \n
815/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
816/// vector. \n
817/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
818/// returned vector. \n
819/// Bit [65]: \n
820/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
821/// returned vector. \n
822/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
823/// returned vector. \n
824/// Bit [129]: \n
825/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
826/// returned vector. \n
827/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
828/// returned vector. \n
829/// Bit [193]: \n
830/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
831/// returned vector. \n
832/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
833/// returned vector.
834/// \returns A 256-bit vector of [4 x double] containing the copied values.
835static __inline __m256d __DEFAULT_FN_ATTRS
836_mm256_permutevar_pd(__m256d __a, __m256i __c)
837{
838 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
839}
840
841/// Copies the values stored in a 128-bit vector of [4 x float] as
842/// specified by the 128-bit integer vector operand.
843///
844/// \headerfile <x86intrin.h>
845///
846/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
847///
848/// \param __a
849/// A 128-bit vector of [4 x float].
850/// \param __c
851/// A 128-bit integer vector operand specifying how the values are to be
852/// copied. \n
853/// Bits [1:0]: \n
854/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
855/// returned vector. \n
856/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
857/// returned vector. \n
858/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
859/// returned vector. \n
860/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
861/// returned vector. \n
862/// Bits [33:32]: \n
863/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
864/// returned vector. \n
865/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
866/// returned vector. \n
867/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
868/// returned vector. \n
869/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
870/// returned vector. \n
871/// Bits [65:64]: \n
872/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
873/// returned vector. \n
874/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
875/// returned vector. \n
876/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
877/// returned vector. \n
878/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
879/// returned vector. \n
880/// Bits [97:96]: \n
881/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
882/// returned vector. \n
883/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
884/// returned vector. \n
885/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
886/// returned vector. \n
887/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
888/// returned vector.
889/// \returns A 128-bit vector of [4 x float] containing the copied values.
890static __inline __m128 __DEFAULT_FN_ATTRS128
891_mm_permutevar_ps(__m128 __a, __m128i __c)
892{
893 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
894}
895
896/// Copies the values stored in a 256-bit vector of [8 x float] as
897/// specified by the 256-bit integer vector operand.
898///
899/// \headerfile <x86intrin.h>
900///
901/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
902///
903/// \param __a
904/// A 256-bit vector of [8 x float].
905/// \param __c
906/// A 256-bit integer vector operand specifying how the values are to be
907/// copied. \n
908/// Bits [1:0]: \n
909/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
910/// returned vector. \n
911/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
912/// returned vector. \n
913/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
914/// returned vector. \n
915/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
916/// returned vector. \n
917/// Bits [33:32]: \n
918/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
919/// returned vector. \n
920/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
921/// returned vector. \n
922/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
923/// returned vector. \n
924/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
925/// returned vector. \n
926/// Bits [65:64]: \n
927/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
928/// returned vector. \n
929/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
930/// returned vector. \n
931/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
932/// returned vector. \n
933/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
934/// returned vector. \n
935/// Bits [97:96]: \n
936/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
937/// returned vector. \n
938/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
939/// returned vector. \n
940/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
941/// returned vector. \n
942/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
943/// returned vector. \n
944/// Bits [129:128]: \n
945/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
946/// returned vector. \n
947/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
948/// returned vector. \n
949/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
950/// returned vector. \n
951/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
952/// returned vector. \n
953/// Bits [161:160]: \n
954/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
955/// returned vector. \n
956/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
957/// returned vector. \n
958/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
959/// returned vector. \n
960/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
961/// returned vector. \n
962/// Bits [193:192]: \n
963/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
964/// returned vector. \n
965/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
966/// returned vector. \n
967/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
968/// returned vector. \n
969/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
970/// returned vector. \n
971/// Bits [225:224]: \n
972/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
973/// returned vector. \n
974/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
975/// returned vector. \n
976/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
977/// returned vector. \n
978/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
979/// returned vector.
980/// \returns A 256-bit vector of [8 x float] containing the copied values.
981static __inline __m256 __DEFAULT_FN_ATTRS
983{
984 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
985}
986
/// Builds a 128-bit vector of [2 x double] by choosing, for each result
///    element, one of the two source elements according to an immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double] supplying the source elements.
/// \param C
///    An immediate integer operand holding one selector bit per result
///    element: \n
///    Bit [0] selects the value copied to bits [63:0] of the result. \n
///    Bit [1] selects the value copied to bits [127:64] of the result. \n
///    Each selector bit is interpreted as follows: \n
///    0: Bits [63:0] of the source are copied. \n
///    1: Bits [127:64] of the source are copied.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd( \
      (__v2df)(__m128d)(A), \
      (int)(C)))
1016
/// Builds a 256-bit vector of [4 x double] by choosing, for each result
///    element, one of the two source elements in the same 128-bit half,
///    according to an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double] supplying the source elements.
/// \param C
///    An immediate integer operand holding one selector bit per result
///    element. \n
///    Bit [0] selects the value copied to bits [63:0] of the result and
///    bit [1] selects the value copied to bits [127:64] of the result: \n
///    0: Bits [63:0] of the source are copied. \n
///    1: Bits [127:64] of the source are copied. \n
///    Bit [2] selects the value copied to bits [191:128] of the result and
///    bit [3] selects the value copied to bits [255:192] of the result: \n
///    0: Bits [191:128] of the source are copied. \n
///    1: Bits [255:192] of the source are copied.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256( \
      (__v4df)(__m256d)(A), \
      (int)(C)))
1056
/// Builds a 128-bit vector of [4 x float] by choosing, for each result
///    element, one of the four source elements according to an immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float] supplying the source elements.
/// \param C
///    An immediate integer operand holding four 2-bit selectors: \n
///    Bits [1:0] select the value copied to bits [31:0] of the result. \n
///    Bits [3:2] select the value copied to bits [63:32] of the result. \n
///    Bits [5:4] select the value copied to bits [95:64] of the result. \n
///    Bits [7:6] select the value copied to bits [127:96] of the result. \n
///    Each selector is interpreted as follows: \n
///    00: Bits [31:0] of the source are copied. \n
///    01: Bits [63:32] of the source are copied. \n
///    10: Bits [95:64] of the source are copied. \n
///    11: Bits [127:96] of the source are copied.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps( \
      (__v4sf)(__m128)(A), \
      (int)(C)))
1112
/// Builds a 256-bit vector of [8 x float] by choosing, for each result
///    element, one of the four source elements in the same 128-bit half,
///    according to an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float] supplying the source elements.
/// \param C
///    An immediate integer operand holding four 2-bit selectors. Each
///    selector is applied to both 128-bit halves: it picks a source element
///    from the lower half for the lower half of the result, and the
///    corresponding source element from the upper half for the upper half of
///    the result. \n
///    Bits [1:0] select the values copied to bits [31:0] and [159:128] of
///    the result. \n
///    Bits [3:2] select the values copied to bits [63:32] and [191:160] of
///    the result. \n
///    Bits [5:4] select the values copied to bits [95:64] and [223:192] of
///    the result. \n
///    Bits [7:6] select the values copied to bits [127:96] and [255:224] of
///    the result. \n
///    Each selector is interpreted as follows: \n
///    00: Bits [31:0] of the source are copied to the lower-half
///        destination; bits [159:128] to the upper-half destination. \n
///    01: Bits [63:32] of the source are copied to the lower-half
///        destination; bits [191:160] to the upper-half destination. \n
///    10: Bits [95:64] of the source are copied to the lower-half
///        destination; bits [223:192] to the upper-half destination. \n
///    11: Bits [127:96] of the source are copied to the lower-half
///        destination; bits [255:224] to the upper-half destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256( \
      (__v8sf)(__m256)(A), \
      (int)(C)))
1204
/// Builds a 256-bit vector of [4 x double] whose two 128-bit halves are
///    each chosen from the four 128-bit halves of two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0] select the value copied to bits [127:0] of the
///    destination. \n
///    Bits [5:4] select the value copied to bits [255:128] of the
///    destination. \n
///    Each selector is interpreted as follows: \n
///    00: Bits [127:0] of operand \a V1 are copied. \n
///    01: Bits [255:128] of operand \a V1 are copied. \n
///    10: Bits [127:0] of operand \a V2 are copied. \n
///    11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (int)(M)))
1245
/// Builds a 256-bit vector of [8 x float] whose two 128-bit halves are
///    each chosen from the four 128-bit halves of two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0] select the value copied to bits [127:0] of the
///    destination. \n
///    Bits [5:4] select the value copied to bits [255:128] of the
///    destination. \n
///    Each selector is interpreted as follows: \n
///    00: Bits [127:0] of operand \a V1 are copied. \n
///    01: Bits [255:128] of operand \a V1 are copied. \n
///    10: Bits [127:0] of operand \a V2 are copied. \n
///    11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (int)(M)))
1286
/// Builds a 256-bit integer vector whose two 128-bit halves are each
///    chosen from the four 128-bit halves of two 256-bit integer vectors, as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0] select the value copied to bits [127:0] of the
///    destination. \n
///    Bits [5:4] select the value copied to bits [255:128] of the
///    destination. \n
///    Each selector is interpreted as follows: \n
///    00: Bits [127:0] of operand \a V1 are copied. \n
///    01: Bits [255:128] of operand \a V1 are copied. \n
///    10: Bits [127:0] of operand \a V2 are copied. \n
///    11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256( \
      (__v8si)(__m256i)(V1), \
      (__v8si)(__m256i)(V2), \
      (int)(M)))
1326
1327/* Vector Blend */
/// Builds a 256-bit vector of [4 x double] in which every 64-bit element
///    is taken from the same position in one of two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand whose mask bits [3:0] specify how the
///    values are to be copied. The position of a mask bit is the index of
///    the element it controls. \n
///    0: The corresponding 64-bit element of operand \a V1 is copied to the
///       same position in the destination. \n
///    1: The corresponding 64-bit element of operand \a V2 is copied to the
///       same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (int)(M)))
1355
/// Builds a 256-bit vector of [8 x float] in which every 32-bit element is
///    taken from the same position in one of two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand whose mask bits [7:0] specify how the
///    values are to be copied. The position of a mask bit is the index of
///    the element it controls. \n
///    0: The corresponding 32-bit element of operand \a V1 is copied to the
///       same position in the destination. \n
///    1: The corresponding 32-bit element of operand \a V2 is copied to the
///       same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (int)(M)))
1383
1384/// Merges 64-bit double-precision data values stored in either of the
1385/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1386/// operand.
1387///
1388/// \headerfile <x86intrin.h>
1389///
1390/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1391///
1392/// \param __a
1393/// A 256-bit vector of [4 x double].
1394/// \param __b
1395/// A 256-bit vector of [4 x double].
1396/// \param __c
1397/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1398/// how the values are to be copied. The position of the mask bit corresponds
1399/// to the most significant bit of a copied value. When a mask bit is 0, the
1400/// corresponding 64-bit element in operand \a __a is copied to the same
1401/// position in the destination. When a mask bit is 1, the corresponding
1402/// 64-bit element in operand \a __b is copied to the same position in the
1403/// destination.
1404/// \returns A 256-bit vector of [4 x double] containing the copied values.
1405static __inline __m256d __DEFAULT_FN_ATTRS
1406_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1407{
1408 return (__m256d)__builtin_ia32_blendvpd256(
1409 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1410}
1411
1412/// Merges 32-bit single-precision data values stored in either of the
1413/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1414/// operand.
1415///
1416/// \headerfile <x86intrin.h>
1417///
1418/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1419///
1420/// \param __a
1421/// A 256-bit vector of [8 x float].
1422/// \param __b
1423/// A 256-bit vector of [8 x float].
1424/// \param __c
1425/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1426/// and 31 specifying how the values are to be copied. The position of the
1427/// mask bit corresponds to the most significant bit of a copied value. When
1428/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1429/// copied to the same position in the destination. When a mask bit is 1, the
1430/// corresponding 32-bit element in operand \a __b is copied to the same
1431/// position in the destination.
1432/// \returns A 256-bit vector of [8 x float] containing the copied values.
1433static __inline __m256 __DEFAULT_FN_ATTRS
1434_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1435{
1436 return (__m256)__builtin_ia32_blendvps256(
1437 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1438}
1439
1440/* Vector Dot Product */
/// Computes two dot products in parallel, one from the lower halves and
///    one from the upper halves of two [8 x float] vectors, and returns each
///    dot product in the matching half of the [8 x float] result.
///
///    The immediate integer operand controls which input elements
///    contribute to the dot product and where the final results are
///    returned. In general, for each dot product, the four corresponding
///    elements of the input vectors are multiplied; the first two and second
///    two products are summed, then the two sums are added to form the final
///    result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument, applied in the same way to each of the
///    two parallel dot product computations. \n
///    Bits [7:4] determine which elements of the input vectors are used,
///    with bit [4] corresponding to the lowest element and bit [7]
///    corresponding to the highest element of each [4 x float] subvector. If
///    a bit is set, the corresponding elements from the two input vectors
///    are used as an input for dot product; otherwise that input is treated
///    as zero. \n
///    Bits [3:0] determine which elements of the result will receive a copy
///    of the final dot product, with bit [0] corresponding to the lowest
///    element and bit [3] corresponding to the highest element of each
///    [4 x float] subvector. If a bit is set, the dot product is returned in
///    the corresponding element; otherwise that element is set to zero.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (int)(M)))
1481
1482/* Vector shuffle */
/// Builds a 256-bit vector of [8 x float] from eight float values selected
///    from two 256-bit operands of [8 x float], as specified by the
///    immediate value operand.
///
///    Four elements are selected from each operand according to the bits of
///    the immediate operand. The elements selected from the first operand
///    are copied to bits [63:0] and bits [191:128] of the destination; the
///    elements selected from the second operand are copied to bits [127:64]
///    and bits [255:192] of the destination. For example, if bits [7:0] of
///    the immediate operand contain a value of 0xFF, the 256-bit destination
///    vector would contain the following values: b[7], b[7], a[7], a[7],
///    b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four elements selected from this
///    operand are copied to bits [63:0] and bits [191:128] in the
///    destination, according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four elements selected from this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements
///    to copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    Each 2-bit field assigns a value to one destination element in each
///    128-bit half: \n
///    Bits [1:0] assign values to bits [31:0] and [159:128] in the
///    destination. \n
///    Bits [3:2] assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] assign values to bits [127:96] and [255:224] in the
///    destination. \n
///    Each 2-bit field is interpreted as follows: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected
///        operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256( \
      (__v8sf)(__m256)(a), \
      (__v8sf)(__m256)(b), \
      (int)(mask)))
1538
/// Builds a 256-bit vector of [4 x double] from four double-precision
///    values selected from two 256-bit operands of [4 x double], as
///    specified by the immediate value operand.
///
///    The elements selected from the first operand are copied to bits [63:0]
///    and bits [191:128] in the destination; the elements selected from the
///    second operand are copied to bits [127:64] and bits [255:192] in the
///    destination. For example, if bits [3:0] of the immediate operand
///    contain a value of 0xF, the 256-bit destination vector would contain
///    the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value containing a 4-bit value in bits [3:0], one bit per
///    destination element, specifying which elements to copy from \a a and
///    \a b: \n
///    Bit [0] selects the value copied from \a a to bits [63:0] of the
///    destination: \n
///    0: Bits [63:0] of \a a are copied. \n
///    1: Bits [127:64] of \a a are copied. \n
///    Bit [1] selects the value copied from \a b to bits [127:64] of the
///    destination: \n
///    0: Bits [63:0] of \a b are copied. \n
///    1: Bits [127:64] of \a b are copied. \n
///    Bit [2] selects the value copied from \a a to bits [191:128] of the
///    destination: \n
///    0: Bits [191:128] of \a a are copied. \n
///    1: Bits [255:192] of \a a are copied. \n
///    Bit [3] selects the value copied from \a b to bits [255:192] of the
///    destination: \n
///    0: Bits [191:128] of \a b are copied. \n
///    1: Bits [255:192] of \a b are copied.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256( \
      (__v4df)(__m256d)(a), \
      (__v4df)(__m256d)(b), \
      (int)(mask)))
1584
/* Compare */
/* Comparison predicate encodings 0x08-0x1f for the `c` immediate of the
 * _mm*_cmp_* intrinsics. Values 0x00-0x07 from the predicate tables are not
 * defined in this section. */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1610
1611/* Below intrinsic defined in emmintrin.h can be used for AVX */
1612/// Compares each of the corresponding double-precision values of two
1613/// 128-bit vectors of [2 x double], using the operation specified by the
1614/// immediate integer operand.
1615///
1616/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1617/// If either value in a comparison is NaN, comparisons that are ordered
1618/// return false, and comparisons that are unordered return true.
1619///
1620/// \headerfile <x86intrin.h>
1621///
1622/// \code
1623/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1624/// \endcode
1625///
1626/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1627///
1628/// \param a
1629/// A 128-bit vector of [2 x double].
1630/// \param b
1631/// A 128-bit vector of [2 x double].
1632/// \param c
1633/// An immediate integer operand, with bits [4:0] specifying which comparison
1634/// operation to use: \n
1635/// 0x00: Equal (ordered, non-signaling) \n
1636/// 0x01: Less-than (ordered, signaling) \n
1637/// 0x02: Less-than-or-equal (ordered, signaling) \n
1638/// 0x03: Unordered (non-signaling) \n
1639/// 0x04: Not-equal (unordered, non-signaling) \n
1640/// 0x05: Not-less-than (unordered, signaling) \n
1641/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1642/// 0x07: Ordered (non-signaling) \n
1643/// 0x08: Equal (unordered, non-signaling) \n
1644/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1645/// 0x0A: Not-greater-than (unordered, signaling) \n
1646/// 0x0B: False (ordered, non-signaling) \n
1647/// 0x0C: Not-equal (ordered, non-signaling) \n
1648/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1649/// 0x0E: Greater-than (ordered, signaling) \n
1650/// 0x0F: True (unordered, non-signaling) \n
1651/// 0x10: Equal (ordered, signaling) \n
1652/// 0x11: Less-than (ordered, non-signaling) \n
1653/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1654/// 0x13: Unordered (signaling) \n
1655/// 0x14: Not-equal (unordered, signaling) \n
1656/// 0x15: Not-less-than (unordered, non-signaling) \n
1657/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1658/// 0x17: Ordered (signaling) \n
1659/// 0x18: Equal (unordered, signaling) \n
1660/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1661/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1662/// 0x1B: False (ordered, signaling) \n
1663/// 0x1C: Not-equal (ordered, signaling) \n
1664/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1665/// 0x1E: Greater-than (ordered, non-signaling) \n
1666/// 0x1F: True (unordered, signaling)
1667/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1668/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1669
1670/* Below intrinsic defined in xmmintrin.h can be used for AVX */
1671/// Compares each of the corresponding values of two 128-bit vectors of
1672/// [4 x float], using the operation specified by the immediate integer
1673/// operand.
1674///
1675/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1676/// If either value in a comparison is NaN, comparisons that are ordered
1677/// return false, and comparisons that are unordered return true.
1678///
1679/// \headerfile <x86intrin.h>
1680///
1681/// \code
1682/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1683/// \endcode
1684///
1685/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1686///
1687/// \param a
1688/// A 128-bit vector of [4 x float].
1689/// \param b
1690/// A 128-bit vector of [4 x float].
1691/// \param c
1692/// An immediate integer operand, with bits [4:0] specifying which comparison
1693/// operation to use: \n
1694/// 0x00: Equal (ordered, non-signaling) \n
1695/// 0x01: Less-than (ordered, signaling) \n
1696/// 0x02: Less-than-or-equal (ordered, signaling) \n
1697/// 0x03: Unordered (non-signaling) \n
1698/// 0x04: Not-equal (unordered, non-signaling) \n
1699/// 0x05: Not-less-than (unordered, signaling) \n
1700/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1701/// 0x07: Ordered (non-signaling) \n
1702/// 0x08: Equal (unordered, non-signaling) \n
1703/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1704/// 0x0A: Not-greater-than (unordered, signaling) \n
1705/// 0x0B: False (ordered, non-signaling) \n
1706/// 0x0C: Not-equal (ordered, non-signaling) \n
1707/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1708/// 0x0E: Greater-than (ordered, signaling) \n
1709/// 0x0F: True (unordered, non-signaling) \n
1710/// 0x10: Equal (ordered, signaling) \n
1711/// 0x11: Less-than (ordered, non-signaling) \n
1712/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1713/// 0x13: Unordered (signaling) \n
1714/// 0x14: Not-equal (unordered, signaling) \n
1715/// 0x15: Not-less-than (unordered, non-signaling) \n
1716/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1717/// 0x17: Ordered (signaling) \n
1718/// 0x18: Equal (unordered, signaling) \n
1719/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1720/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1721/// 0x1B: False (ordered, signaling) \n
1722/// 0x1C: Not-equal (ordered, signaling) \n
1723/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1724/// 0x1E: Greater-than (ordered, non-signaling) \n
1725/// 0x1F: True (unordered, signaling)
1726/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1727/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1728
1729/// Compares each of the corresponding double-precision values of two
1730/// 256-bit vectors of [4 x double], using the operation specified by the
1731/// immediate integer operand.
1732///
1733/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1734/// If either value in a comparison is NaN, comparisons that are ordered
1735/// return false, and comparisons that are unordered return true.
1736///
1737/// \headerfile <x86intrin.h>
1738///
1739/// \code
1740/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
1741/// \endcode
1742///
1743/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1744///
1745/// \param a
1746/// A 256-bit vector of [4 x double].
1747/// \param b
1748/// A 256-bit vector of [4 x double].
1749/// \param c
1750/// An immediate integer operand, with bits [4:0] specifying which comparison
1751/// operation to use: \n
1752/// 0x00: Equal (ordered, non-signaling) \n
1753/// 0x01: Less-than (ordered, signaling) \n
1754/// 0x02: Less-than-or-equal (ordered, signaling) \n
1755/// 0x03: Unordered (non-signaling) \n
1756/// 0x04: Not-equal (unordered, non-signaling) \n
1757/// 0x05: Not-less-than (unordered, signaling) \n
1758/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1759/// 0x07: Ordered (non-signaling) \n
1760/// 0x08: Equal (unordered, non-signaling) \n
1761/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1762/// 0x0A: Not-greater-than (unordered, signaling) \n
1763/// 0x0B: False (ordered, non-signaling) \n
1764/// 0x0C: Not-equal (ordered, non-signaling) \n
1765/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1766/// 0x0E: Greater-than (ordered, signaling) \n
1767/// 0x0F: True (unordered, non-signaling) \n
1768/// 0x10: Equal (ordered, signaling) \n
1769/// 0x11: Less-than (ordered, non-signaling) \n
1770/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1771/// 0x13: Unordered (signaling) \n
1772/// 0x14: Not-equal (unordered, signaling) \n
1773/// 0x15: Not-less-than (unordered, non-signaling) \n
1774/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1775/// 0x17: Ordered (signaling) \n
1776/// 0x18: Equal (unordered, signaling) \n
1777/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1778/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1779/// 0x1B: False (ordered, signaling) \n
1780/// 0x1C: Not-equal (ordered, signaling) \n
1781/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1782/// 0x1E: Greater-than (ordered, non-signaling) \n
1783/// 0x1F: True (unordered, signaling)
1784/// \returns A 256-bit vector of [4 x double] containing the comparison results.
/* Each operand is cast to the public __m256d type and then to __v4df, the
 * vector type handed to the builtin; the predicate c is passed through
 * unchanged. */
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1788
1789/// Compares each of the corresponding values of two 256-bit vectors of
1790/// [8 x float], using the operation specified by the immediate integer
1791/// operand.
1792///
1793/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1794/// If either value in a comparison is NaN, comparisons that are ordered
1795/// return false, and comparisons that are unordered return true.
1796///
1797/// \headerfile <x86intrin.h>
1798///
1799/// \code
1800/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
1801/// \endcode
1802///
1803/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1804///
1805/// \param a
1806/// A 256-bit vector of [8 x float].
1807/// \param b
1808/// A 256-bit vector of [8 x float].
1809/// \param c
1810/// An immediate integer operand, with bits [4:0] specifying which comparison
1811/// operation to use: \n
1812/// 0x00: Equal (ordered, non-signaling) \n
1813/// 0x01: Less-than (ordered, signaling) \n
1814/// 0x02: Less-than-or-equal (ordered, signaling) \n
1815/// 0x03: Unordered (non-signaling) \n
1816/// 0x04: Not-equal (unordered, non-signaling) \n
1817/// 0x05: Not-less-than (unordered, signaling) \n
1818/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1819/// 0x07: Ordered (non-signaling) \n
1820/// 0x08: Equal (unordered, non-signaling) \n
1821/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1822/// 0x0A: Not-greater-than (unordered, signaling) \n
1823/// 0x0B: False (ordered, non-signaling) \n
1824/// 0x0C: Not-equal (ordered, non-signaling) \n
1825/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1826/// 0x0E: Greater-than (ordered, signaling) \n
1827/// 0x0F: True (unordered, non-signaling) \n
1828/// 0x10: Equal (ordered, signaling) \n
1829/// 0x11: Less-than (ordered, non-signaling) \n
1830/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1831/// 0x13: Unordered (signaling) \n
1832/// 0x14: Not-equal (unordered, signaling) \n
1833/// 0x15: Not-less-than (unordered, non-signaling) \n
1834/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1835/// 0x17: Ordered (signaling) \n
1836/// 0x18: Equal (unordered, signaling) \n
1837/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1838/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1839/// 0x1B: False (ordered, signaling) \n
1840/// 0x1C: Not-equal (ordered, signaling) \n
1841/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1842/// 0x1E: Greater-than (ordered, non-signaling) \n
1843/// 0x1F: True (unordered, signaling)
1844/// \returns A 256-bit vector of [8 x float] containing the comparison results.
/* Each operand is cast to the public __m256 type and then to __v8sf, the
 * vector type handed to the builtin; the predicate c is passed through
 * unchanged. */
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1848
1849/* Below intrinsic defined in emmintrin.h can be used for AVX */
1850/// Compares each of the corresponding scalar double-precision values of
1851/// two 128-bit vectors of [2 x double], using the operation specified by the
1852/// immediate integer operand.
1853///
1854/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1855/// If either value in a comparison is NaN, comparisons that are ordered
1856/// return false, and comparisons that are unordered return true.
1857///
1858/// \headerfile <x86intrin.h>
1859///
1860/// \code
1861/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1862/// \endcode
1863///
1864/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1865///
1866/// \param a
1867/// A 128-bit vector of [2 x double].
1868/// \param b
1869/// A 128-bit vector of [2 x double].
1870/// \param c
1871/// An immediate integer operand, with bits [4:0] specifying which comparison
1872/// operation to use: \n
1873/// 0x00: Equal (ordered, non-signaling) \n
1874/// 0x01: Less-than (ordered, signaling) \n
1875/// 0x02: Less-than-or-equal (ordered, signaling) \n
1876/// 0x03: Unordered (non-signaling) \n
1877/// 0x04: Not-equal (unordered, non-signaling) \n
1878/// 0x05: Not-less-than (unordered, signaling) \n
1879/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1880/// 0x07: Ordered (non-signaling) \n
1881/// 0x08: Equal (unordered, non-signaling) \n
1882/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1883/// 0x0A: Not-greater-than (unordered, signaling) \n
1884/// 0x0B: False (ordered, non-signaling) \n
1885/// 0x0C: Not-equal (ordered, non-signaling) \n
1886/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1887/// 0x0E: Greater-than (ordered, signaling) \n
1888/// 0x0F: True (unordered, non-signaling) \n
1889/// 0x10: Equal (ordered, signaling) \n
1890/// 0x11: Less-than (ordered, non-signaling) \n
1891/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1892/// 0x13: Unordered (signaling) \n
1893/// 0x14: Not-equal (unordered, signaling) \n
1894/// 0x15: Not-less-than (unordered, non-signaling) \n
1895/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1896/// 0x17: Ordered (signaling) \n
1897/// 0x18: Equal (unordered, signaling) \n
1898/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1899/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1900/// 0x1B: False (ordered, signaling) \n
1901/// 0x1C: Not-equal (ordered, signaling) \n
1902/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1903/// 0x1E: Greater-than (ordered, non-signaling) \n
1904/// 0x1F: True (unordered, signaling)
1905/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1906/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1907
1908/* Below intrinsic defined in xmmintrin.h can be used for AVX */
1909/// Compares each of the corresponding scalar values of two 128-bit
1910/// vectors of [4 x float], using the operation specified by the immediate
1911/// integer operand.
1912///
1913/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1914/// If either value in a comparison is NaN, comparisons that are ordered
1915/// return false, and comparisons that are unordered return true.
1916///
1917/// \headerfile <x86intrin.h>
1918///
1919/// \code
1920/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1921/// \endcode
1922///
1923/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1924///
1925/// \param a
1926/// A 128-bit vector of [4 x float].
1927/// \param b
1928/// A 128-bit vector of [4 x float].
1929/// \param c
1930/// An immediate integer operand, with bits [4:0] specifying which comparison
1931/// operation to use: \n
1932/// 0x00: Equal (ordered, non-signaling) \n
1933/// 0x01: Less-than (ordered, signaling) \n
1934/// 0x02: Less-than-or-equal (ordered, signaling) \n
1935/// 0x03: Unordered (non-signaling) \n
1936/// 0x04: Not-equal (unordered, non-signaling) \n
1937/// 0x05: Not-less-than (unordered, signaling) \n
1938/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1939/// 0x07: Ordered (non-signaling) \n
1940/// 0x08: Equal (unordered, non-signaling) \n
1941/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1942/// 0x0A: Not-greater-than (unordered, signaling) \n
1943/// 0x0B: False (ordered, non-signaling) \n
1944/// 0x0C: Not-equal (ordered, non-signaling) \n
1945/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1946/// 0x0E: Greater-than (ordered, signaling) \n
1947/// 0x0F: True (unordered, non-signaling) \n
1948/// 0x10: Equal (ordered, signaling) \n
1949/// 0x11: Less-than (ordered, non-signaling) \n
1950/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1951/// 0x13: Unordered (signaling) \n
1952/// 0x14: Not-equal (unordered, signaling) \n
1953/// 0x15: Not-less-than (unordered, non-signaling) \n
1954/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1955/// 0x17: Ordered (signaling) \n
1956/// 0x18: Equal (unordered, signaling) \n
1957/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1958/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1959/// 0x1B: False (ordered, signaling) \n
1960/// 0x1C: Not-equal (ordered, signaling) \n
1961/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1962/// 0x1E: Greater-than (ordered, non-signaling) \n
1963/// 0x1F: True (unordered, signaling)
1964/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1965/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1966
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1988
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
/* The (unsigned short) cast before (int) is what zero-extends the element. */
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
2011
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
/* The (unsigned char) cast before (int) is what zero-extends the element. */
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2034
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2058
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2084
2085
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2111
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2137
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2165
/* Conversion */
2167/// Converts a vector of [4 x i32] into a vector of [4 x double].
2168///
2169/// \headerfile <x86intrin.h>
2170///
2171/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2172///
2173/// \param __a
2174/// A 128-bit integer vector of [4 x i32].
2175/// \returns A 256-bit vector of [4 x double] containing the converted values.
2176static __inline __m256d __DEFAULT_FN_ATTRS
2178{
2179 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2180}
2181
2182/// Converts a vector of [8 x i32] into a vector of [8 x float].
2183///
2184/// \headerfile <x86intrin.h>
2185///
2186/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2187///
2188/// \param __a
2189/// A 256-bit integer vector.
2190/// \returns A 256-bit vector of [8 x float] containing the converted values.
2191static __inline __m256 __DEFAULT_FN_ATTRS
2193{
2194 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2195}
2196
2197/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2198/// [4 x float].
2199///
2200/// \headerfile <x86intrin.h>
2201///
2202/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2203///
2204/// \param __a
2205/// A 256-bit vector of [4 x double].
2206/// \returns A 128-bit vector of [4 x float] containing the converted values.
2207static __inline __m128 __DEFAULT_FN_ATTRS
2209{
2210 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2211}
2212
2213/// Converts a vector of [8 x float] into a vector of [8 x i32].
2214///
2215/// If a converted value does not fit in a 32-bit integer, raises a
2216/// floating-point invalid exception. If the exception is masked, returns
2217/// the most negative integer.
2218///
2219/// \headerfile <x86intrin.h>
2220///
2221/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2222///
2223/// \param __a
2224/// A 256-bit vector of [8 x float].
2225/// \returns A 256-bit integer vector containing the converted values.
2226static __inline __m256i __DEFAULT_FN_ATTRS
2228{
2229 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2230}
2231
2232/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2233/// x double].
2234///
2235/// \headerfile <x86intrin.h>
2236///
2237/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2238///
2239/// \param __a
2240/// A 128-bit vector of [4 x float].
2241/// \returns A 256-bit vector of [4 x double] containing the converted values.
2242static __inline __m256d __DEFAULT_FN_ATTRS
2244{
2245 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2246}
2247
2248/// Converts a 256-bit vector of [4 x double] into four signed truncated
2249/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2250/// [4 x i32].
2251///
2252/// If a converted value does not fit in a 32-bit integer, raises a
2253/// floating-point invalid exception. If the exception is masked, returns
2254/// the most negative integer.
2255///
2256/// \headerfile <x86intrin.h>
2257///
2258/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2259///
2260/// \param __a
2261/// A 256-bit vector of [4 x double].
2262/// \returns A 128-bit integer vector containing the converted values.
2263static __inline __m128i __DEFAULT_FN_ATTRS
2265{
2266 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2267}
2268
2269/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2270/// [4 x i32].
2271///
2272/// If a converted value does not fit in a 32-bit integer, raises a
2273/// floating-point invalid exception. If the exception is masked, returns
2274/// the most negative integer.
2275///
2276/// \headerfile <x86intrin.h>
2277///
2278/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2279///
2280/// \param __a
2281/// A 256-bit vector of [4 x double].
2282/// \returns A 128-bit integer vector containing the converted values.
2283static __inline __m128i __DEFAULT_FN_ATTRS
2285{
2286 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2287}
2288
2289/// Converts a vector of [8 x float] into eight signed truncated (rounded
2290/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2291///
2292/// If a converted value does not fit in a 32-bit integer, raises a
2293/// floating-point invalid exception. If the exception is masked, returns
2294/// the most negative integer.
2295///
2296/// \headerfile <x86intrin.h>
2297///
2298/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2299///
2300/// \param __a
2301/// A 256-bit vector of [8 x float].
2302/// \returns A 256-bit integer vector containing the converted values.
2303static __inline __m256i __DEFAULT_FN_ATTRS
2305{
2306 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2307}
2308
2309/// Returns the first element of the input vector of [4 x double].
2310///
2311/// \headerfile <x86intrin.h>
2312///
2313/// This intrinsic is a utility function and does not correspond to a specific
2314/// instruction.
2315///
2316/// \param __a
2317/// A 256-bit vector of [4 x double].
2318/// \returns A 64 bit double containing the first element of the input vector.
2319static __inline double __DEFAULT_FN_ATTRS
2321{
2322 return __a[0];
2323}
2324
2325/// Returns the first element of the input vector of [8 x i32].
2326///
2327/// \headerfile <x86intrin.h>
2328///
2329/// This intrinsic is a utility function and does not correspond to a specific
2330/// instruction.
2331///
2332/// \param __a
2333/// A 256-bit vector of [8 x i32].
2334/// \returns A 32 bit integer containing the first element of the input vector.
2335static __inline int __DEFAULT_FN_ATTRS
2337{
2338 __v8si __b = (__v8si)__a;
2339 return __b[0];
2340}
2341
2342/// Returns the first element of the input vector of [8 x float].
2343///
2344/// \headerfile <x86intrin.h>
2345///
2346/// This intrinsic is a utility function and does not correspond to a specific
2347/// instruction.
2348///
2349/// \param __a
2350/// A 256-bit vector of [8 x float].
2351/// \returns A 32 bit float containing the first element of the input vector.
2352static __inline float __DEFAULT_FN_ATTRS
2354{
2355 return __a[0];
2356}
2357
2358/* Vector replicate */
2359/// Moves and duplicates odd-indexed values from a 256-bit vector of
2360/// [8 x float] to float values in a 256-bit vector of [8 x float].
2361///
2362/// \headerfile <x86intrin.h>
2363///
2364/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2365///
2366/// \param __a
2367/// A 256-bit vector of [8 x float]. \n
2368/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2369/// the return value. \n
2370/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2371/// the return value. \n
2372/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2373/// return value. \n
2374/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2375/// return value.
2376/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2377/// values.
2378static __inline __m256 __DEFAULT_FN_ATTRS
2380{
2381 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2382}
2383
2384/// Moves and duplicates even-indexed values from a 256-bit vector of
2385/// [8 x float] to float values in a 256-bit vector of [8 x float].
2386///
2387/// \headerfile <x86intrin.h>
2388///
2389/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2390///
2391/// \param __a
2392/// A 256-bit vector of [8 x float]. \n
2393/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2394/// the return value. \n
2395/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2396/// the return value. \n
2397/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2398/// return value. \n
2399/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2400/// return value.
2401/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2402/// values.
2403static __inline __m256 __DEFAULT_FN_ATTRS
2405{
2406 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2407}
2408
2409/// Moves and duplicates double-precision floating point values from a
2410/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2411/// vector of [4 x double].
2412///
2413/// \headerfile <x86intrin.h>
2414///
2415/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2416///
2417/// \param __a
2418/// A 256-bit vector of [4 x double]. \n
2419/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2420/// return value. \n
2421/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2422/// the return value.
2423/// \returns A 256-bit vector of [4 x double] containing the moved and
2424/// duplicated values.
2425static __inline __m256d __DEFAULT_FN_ATTRS
2427{
2428 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2429}
2430
2431/* Unpack and Interleave */
2432/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2433/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2434///
2435/// \headerfile <x86intrin.h>
2436///
2437/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2438///
2439/// \param __a
2440/// A 256-bit floating-point vector of [4 x double]. \n
2441/// Bits [127:64] are written to bits [63:0] of the return value. \n
2442/// Bits [255:192] are written to bits [191:128] of the return value. \n
2443/// \param __b
2444/// A 256-bit floating-point vector of [4 x double]. \n
2445/// Bits [127:64] are written to bits [127:64] of the return value. \n
2446/// Bits [255:192] are written to bits [255:192] of the return value. \n
2447/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2448static __inline __m256d __DEFAULT_FN_ATTRS
2449_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2450{
2451 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2452}
2453
2454/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2455/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2456///
2457/// \headerfile <x86intrin.h>
2458///
2459/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2460///
2461/// \param __a
2462/// A 256-bit floating-point vector of [4 x double]. \n
2463/// Bits [63:0] are written to bits [63:0] of the return value. \n
2464/// Bits [191:128] are written to bits [191:128] of the return value.
2465/// \param __b
2466/// A 256-bit floating-point vector of [4 x double]. \n
2467/// Bits [63:0] are written to bits [127:64] of the return value. \n
2468/// Bits [191:128] are written to bits [255:192] of the return value. \n
2469/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2470static __inline __m256d __DEFAULT_FN_ATTRS
2471_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2472{
2473 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2474}
2475
2476/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2477/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2478/// vector of [8 x float].
2479///
2480/// \headerfile <x86intrin.h>
2481///
2482/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2483///
2484/// \param __a
2485/// A 256-bit vector of [8 x float]. \n
2486/// Bits [95:64] are written to bits [31:0] of the return value. \n
2487/// Bits [127:96] are written to bits [95:64] of the return value. \n
2488/// Bits [223:192] are written to bits [159:128] of the return value. \n
2489/// Bits [255:224] are written to bits [223:192] of the return value.
2490/// \param __b
2491/// A 256-bit vector of [8 x float]. \n
2492/// Bits [95:64] are written to bits [63:32] of the return value. \n
2493/// Bits [127:96] are written to bits [127:96] of the return value. \n
2494/// Bits [223:192] are written to bits [191:160] of the return value. \n
2495/// Bits [255:224] are written to bits [255:224] of the return value.
2496/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2497static __inline __m256 __DEFAULT_FN_ATTRS
2499{
2500 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2501}
2502
2503/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2504/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2505/// vector of [8 x float].
2506///
2507/// \headerfile <x86intrin.h>
2508///
2509/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2510///
2511/// \param __a
2512/// A 256-bit vector of [8 x float]. \n
2513/// Bits [31:0] are written to bits [31:0] of the return value. \n
2514/// Bits [63:32] are written to bits [95:64] of the return value. \n
2515/// Bits [159:128] are written to bits [159:128] of the return value. \n
2516/// Bits [191:160] are written to bits [223:192] of the return value.
2517/// \param __b
2518/// A 256-bit vector of [8 x float]. \n
2519/// Bits [31:0] are written to bits [63:32] of the return value. \n
2520/// Bits [63:32] are written to bits [127:96] of the return value. \n
2521/// Bits [159:128] are written to bits [191:160] of the return value. \n
2522/// Bits [191:160] are written to bits [255:224] of the return value.
2523/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2524static __inline __m256 __DEFAULT_FN_ATTRS
2526{
2527 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2528}
2529
2530/* Bit Test */
2531/// Given two 128-bit floating-point vectors of [2 x double], perform an
2532/// element-by-element comparison of the double-precision element in the
2533/// first source vector and the corresponding element in the second source
2534/// vector.
2535///
2536/// The EFLAGS register is updated as follows: \n
2537/// If there is at least one pair of double-precision elements where the
2538/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2539/// ZF flag is set to 1. \n
2540/// If there is at least one pair of double-precision elements where the
2541/// sign-bit of the first element is 0 and the sign-bit of the second element
2542/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2543/// This intrinsic returns the value of the ZF flag.
2544///
2545/// \headerfile <x86intrin.h>
2546///
2547/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2548///
2549/// \param __a
2550/// A 128-bit vector of [2 x double].
2551/// \param __b
2552/// A 128-bit vector of [2 x double].
2553/// \returns the ZF flag in the EFLAGS register.
2554static __inline int __DEFAULT_FN_ATTRS128
2555_mm_testz_pd(__m128d __a, __m128d __b)
2556{
2557 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2558}
2559
2560/// Given two 128-bit floating-point vectors of [2 x double], perform an
2561/// element-by-element comparison of the double-precision element in the
2562/// first source vector and the corresponding element in the second source
2563/// vector.
2564///
2565/// The EFLAGS register is updated as follows: \n
2566/// If there is at least one pair of double-precision elements where the
2567/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2568/// ZF flag is set to 1. \n
2569/// If there is at least one pair of double-precision elements where the
2570/// sign-bit of the first element is 0 and the sign-bit of the second element
2571/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2572/// This intrinsic returns the value of the CF flag.
2573///
2574/// \headerfile <x86intrin.h>
2575///
2576/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2577///
2578/// \param __a
2579/// A 128-bit vector of [2 x double].
2580/// \param __b
2581/// A 128-bit vector of [2 x double].
2582/// \returns the CF flag in the EFLAGS register.
2583static __inline int __DEFAULT_FN_ATTRS128
2584_mm_testc_pd(__m128d __a, __m128d __b)
2585{
2586 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2587}
2588
2589/// Given two 128-bit floating-point vectors of [2 x double], perform an
2590/// element-by-element comparison of the double-precision element in the
2591/// first source vector and the corresponding element in the second source
2592/// vector.
2593///
2594/// The EFLAGS register is updated as follows: \n
2595/// If there is at least one pair of double-precision elements where the
2596/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2597/// ZF flag is set to 1. \n
2598/// If there is at least one pair of double-precision elements where the
2599/// sign-bit of the first element is 0 and the sign-bit of the second element
2600/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2601/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2602/// otherwise it returns 0.
2603///
2604/// \headerfile <x86intrin.h>
2605///
2606/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2607///
2608/// \param __a
2609/// A 128-bit vector of [2 x double].
2610/// \param __b
2611/// A 128-bit vector of [2 x double].
2612/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2613static __inline int __DEFAULT_FN_ATTRS128
2614_mm_testnzc_pd(__m128d __a, __m128d __b)
2615{
2616 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2617}
2618
2619/// Given two 128-bit floating-point vectors of [4 x float], perform an
2620/// element-by-element comparison of the single-precision element in the
2621/// first source vector and the corresponding element in the second source
2622/// vector.
2623///
2624/// The EFLAGS register is updated as follows: \n
2625/// If there is at least one pair of single-precision elements where the
2626/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2627/// ZF flag is set to 1. \n
2628/// If there is at least one pair of single-precision elements where the
2629/// sign-bit of the first element is 0 and the sign-bit of the second element
2630/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2631/// This intrinsic returns the value of the ZF flag.
2632///
2633/// \headerfile <x86intrin.h>
2634///
2635/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2636///
2637/// \param __a
2638/// A 128-bit vector of [4 x float].
2639/// \param __b
2640/// A 128-bit vector of [4 x float].
2641/// \returns the ZF flag.
2642static __inline int __DEFAULT_FN_ATTRS128
2643_mm_testz_ps(__m128 __a, __m128 __b)
2644{
2645 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2646}
2647
2648/// Given two 128-bit floating-point vectors of [4 x float], perform an
2649/// element-by-element comparison of the single-precision element in the
2650/// first source vector and the corresponding element in the second source
2651/// vector.
2652///
2653/// The EFLAGS register is updated as follows: \n
2654/// If there is at least one pair of single-precision elements where the
2655/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2656/// ZF flag is set to 1. \n
2657/// If there is at least one pair of single-precision elements where the
2658/// sign-bit of the first element is 0 and the sign-bit of the second element
2659/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2660/// This intrinsic returns the value of the CF flag.
2661///
2662/// \headerfile <x86intrin.h>
2663///
2664/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2665///
2666/// \param __a
2667/// A 128-bit vector of [4 x float].
2668/// \param __b
2669/// A 128-bit vector of [4 x float].
2670/// \returns the CF flag.
2671static __inline int __DEFAULT_FN_ATTRS128
2672_mm_testc_ps(__m128 __a, __m128 __b)
2673{
2674 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2675}
2676
2677/// Given two 128-bit floating-point vectors of [4 x float], perform an
2678/// element-by-element comparison of the single-precision element in the
2679/// first source vector and the corresponding element in the second source
2680/// vector.
2681///
2682/// The EFLAGS register is updated as follows: \n
2683/// If there is at least one pair of single-precision elements where the
2684/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2685/// ZF flag is set to 1. \n
2686/// If there is at least one pair of single-precision elements where the
2687/// sign-bit of the first element is 0 and the sign-bit of the second element
2688/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2689/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2690/// otherwise it returns 0.
2691///
2692/// \headerfile <x86intrin.h>
2693///
2694/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2695///
2696/// \param __a
2697/// A 128-bit vector of [4 x float].
2698/// \param __b
2699/// A 128-bit vector of [4 x float].
2700/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2701static __inline int __DEFAULT_FN_ATTRS128
2702_mm_testnzc_ps(__m128 __a, __m128 __b)
2703{
2704 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2705}
2706
2707/// Given two 256-bit floating-point vectors of [4 x double], perform an
2708/// element-by-element comparison of the double-precision elements in the
2709/// first source vector and the corresponding elements in the second source
2710/// vector.
2711///
2712/// The EFLAGS register is updated as follows: \n
2713/// If there is at least one pair of double-precision elements where the
2714/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2715/// ZF flag is set to 1. \n
2716/// If there is at least one pair of double-precision elements where the
2717/// sign-bit of the first element is 0 and the sign-bit of the second element
2718/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2719/// This intrinsic returns the value of the ZF flag.
2720///
2721/// \headerfile <x86intrin.h>
2722///
2723/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2724///
2725/// \param __a
2726/// A 256-bit vector of [4 x double].
2727/// \param __b
2728/// A 256-bit vector of [4 x double].
2729/// \returns the ZF flag.
2730static __inline int __DEFAULT_FN_ATTRS
2731_mm256_testz_pd(__m256d __a, __m256d __b)
2732{
2733 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2734}
2735
2736/// Given two 256-bit floating-point vectors of [4 x double], perform an
2737/// element-by-element comparison of the double-precision elements in the
2738/// first source vector and the corresponding elements in the second source
2739/// vector.
2740///
2741/// The EFLAGS register is updated as follows: \n
2742/// If there is at least one pair of double-precision elements where the
2743/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2744/// ZF flag is set to 1. \n
2745/// If there is at least one pair of double-precision elements where the
2746/// sign-bit of the first element is 0 and the sign-bit of the second element
2747/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2748/// This intrinsic returns the value of the CF flag.
2749///
2750/// \headerfile <x86intrin.h>
2751///
2752/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2753///
2754/// \param __a
2755/// A 256-bit vector of [4 x double].
2756/// \param __b
2757/// A 256-bit vector of [4 x double].
2758/// \returns the CF flag.
2759static __inline int __DEFAULT_FN_ATTRS
2760_mm256_testc_pd(__m256d __a, __m256d __b)
2761{
2762 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2763}
2764
2765/// Given two 256-bit floating-point vectors of [4 x double], perform an
2766/// element-by-element comparison of the double-precision elements in the
2767/// first source vector and the corresponding elements in the second source
2768/// vector.
2769///
2770/// The EFLAGS register is updated as follows: \n
2771/// If there is at least one pair of double-precision elements where the
2772/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2773/// ZF flag is set to 1. \n
2774/// If there is at least one pair of double-precision elements where the
2775/// sign-bit of the first element is 0 and the sign-bit of the second element
2776/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2777/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2778/// otherwise it returns 0.
2779///
2780/// \headerfile <x86intrin.h>
2781///
2782/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2783///
2784/// \param __a
2785/// A 256-bit vector of [4 x double].
2786/// \param __b
2787/// A 256-bit vector of [4 x double].
2788/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2789static __inline int __DEFAULT_FN_ATTRS
2790_mm256_testnzc_pd(__m256d __a, __m256d __b)
2791{
2792 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2793}
2794
2795/// Given two 256-bit floating-point vectors of [8 x float], perform an
2796/// element-by-element comparison of the single-precision element in the
2797/// first source vector and the corresponding element in the second source
2798/// vector.
2799///
2800/// The EFLAGS register is updated as follows: \n
2801/// If there is at least one pair of single-precision elements where the
2802/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2803/// ZF flag is set to 1. \n
2804/// If there is at least one pair of single-precision elements where the
2805/// sign-bit of the first element is 0 and the sign-bit of the second element
2806/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2807/// This intrinsic returns the value of the ZF flag.
2808///
2809/// \headerfile <x86intrin.h>
2810///
2811/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2812///
2813/// \param __a
2814/// A 256-bit vector of [8 x float].
2815/// \param __b
2816/// A 256-bit vector of [8 x float].
2817/// \returns the ZF flag.
2818static __inline int __DEFAULT_FN_ATTRS
2819_mm256_testz_ps(__m256 __a, __m256 __b)
2820{
2821 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2822}
2823
2824/// Given two 256-bit floating-point vectors of [8 x float], perform an
2825/// element-by-element comparison of the single-precision element in the
2826/// first source vector and the corresponding element in the second source
2827/// vector.
2828///
2829/// The EFLAGS register is updated as follows: \n
2830/// If there is at least one pair of single-precision elements where the
2831/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2832/// ZF flag is set to 1. \n
2833/// If there is at least one pair of single-precision elements where the
2834/// sign-bit of the first element is 0 and the sign-bit of the second element
2835/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2836/// This intrinsic returns the value of the CF flag.
2837///
2838/// \headerfile <x86intrin.h>
2839///
2840/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2841///
2842/// \param __a
2843/// A 256-bit vector of [8 x float].
2844/// \param __b
2845/// A 256-bit vector of [8 x float].
2846/// \returns the CF flag.
2847static __inline int __DEFAULT_FN_ATTRS
2848_mm256_testc_ps(__m256 __a, __m256 __b)
2849{
2850 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2851}
2852
2853/// Given two 256-bit floating-point vectors of [8 x float], perform an
2854/// element-by-element comparison of the single-precision elements in the
2855/// first source vector and the corresponding elements in the second source
2856/// vector.
2857///
2858/// The EFLAGS register is updated as follows: \n
2859/// If there is at least one pair of single-precision elements where the
2860/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2861/// ZF flag is set to 1. \n
2862/// If there is at least one pair of single-precision elements where the
2863/// sign-bit of the first element is 0 and the sign-bit of the second element
2864/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2865/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2866/// otherwise it returns 0.
2867///
2868/// \headerfile <x86intrin.h>
2869///
2870/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2871///
2872/// \param __a
2873/// A 256-bit vector of [8 x float].
2874/// \param __b
2875/// A 256-bit vector of [8 x float].
2876/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2877static __inline int __DEFAULT_FN_ATTRS
2879{
2880 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2881}
2882
2883/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2884/// of the two source vectors.
2885///
2886/// The EFLAGS register is updated as follows: \n
2887/// If there is at least one pair of bits where both bits are 1, the ZF flag
2888/// is set to 0. Otherwise the ZF flag is set to 1. \n
2889/// If there is at least one pair of bits where the bit from the first source
2890/// vector is 0 and the bit from the second source vector is 1, the CF flag
2891/// is set to 0. Otherwise the CF flag is set to 1. \n
2892/// This intrinsic returns the value of the ZF flag.
2893///
2894/// \headerfile <x86intrin.h>
2895///
2896/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2897///
2898/// \param __a
2899/// A 256-bit integer vector.
2900/// \param __b
2901/// A 256-bit integer vector.
2902/// \returns the ZF flag.
2903static __inline int __DEFAULT_FN_ATTRS
2904_mm256_testz_si256(__m256i __a, __m256i __b)
2905{
2906 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2907}
2908
2909/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2910/// of the two source vectors.
2911///
2912/// The EFLAGS register is updated as follows: \n
2913/// If there is at least one pair of bits where both bits are 1, the ZF flag
2914/// is set to 0. Otherwise the ZF flag is set to 1. \n
2915/// If there is at least one pair of bits where the bit from the first source
2916/// vector is 0 and the bit from the second source vector is 1, the CF flag
2917/// is set to 0. Otherwise the CF flag is set to 1. \n
2918/// This intrinsic returns the value of the CF flag.
2919///
2920/// \headerfile <x86intrin.h>
2921///
2922/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2923///
2924/// \param __a
2925/// A 256-bit integer vector.
2926/// \param __b
2927/// A 256-bit integer vector.
2928/// \returns the CF flag.
2929static __inline int __DEFAULT_FN_ATTRS
2930_mm256_testc_si256(__m256i __a, __m256i __b)
2931{
2932 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2933}
2934
2935/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2936/// of the two source vectors.
2937///
2938/// The EFLAGS register is updated as follows: \n
2939/// If there is at least one pair of bits where both bits are 1, the ZF flag
2940/// is set to 0. Otherwise the ZF flag is set to 1. \n
2941/// If there is at least one pair of bits where the bit from the first source
2942/// vector is 0 and the bit from the second source vector is 1, the CF flag
2943/// is set to 0. Otherwise the CF flag is set to 1. \n
2944/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2945/// otherwise it returns 0.
2946///
2947/// \headerfile <x86intrin.h>
2948///
2949/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2950///
2951/// \param __a
2952/// A 256-bit integer vector.
2953/// \param __b
2954/// A 256-bit integer vector.
2955/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2956static __inline int __DEFAULT_FN_ATTRS
2958{
2959 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2960}
2961
2962/* Vector extract sign mask */
2963/// Extracts the sign bits of double-precision floating point elements
2964/// in a 256-bit vector of [4 x double] and writes them to the lower order
2965/// bits of the return value.
2966///
2967/// \headerfile <x86intrin.h>
2968///
2969/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2970///
2971/// \param __a
2972/// A 256-bit vector of [4 x double] containing the double-precision
2973/// floating point values with sign bits to be extracted.
2974/// \returns The sign bits from the operand, written to bits [3:0].
2975static __inline int __DEFAULT_FN_ATTRS
2977{
2978 return __builtin_ia32_movmskpd256((__v4df)__a);
2979}
2980
2981/// Extracts the sign bits of single-precision floating point elements
2982/// in a 256-bit vector of [8 x float] and writes them to the lower order
2983/// bits of the return value.
2984///
2985/// \headerfile <x86intrin.h>
2986///
2987/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2988///
2989/// \param __a
2990/// A 256-bit vector of [8 x float] containing the single-precision floating
2991/// point values with sign bits to be extracted.
2992/// \returns The sign bits from the operand, written to bits [7:0].
2993static __inline int __DEFAULT_FN_ATTRS
2995{
2996 return __builtin_ia32_movmskps256((__v8sf)__a);
2997}
2998
/* Vector __zero */
/// Clears every XMM or YMM register to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}
3010
/// Clears the upper 128 bits (bits 255:128) of every YMM register to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
3021
3022/* Vector load with broadcast */
3023/// Loads a scalar single-precision floating point value from the
3024/// specified address pointed to by \a __a and broadcasts it to the elements
3025/// of a [4 x float] vector.
3026///
3027/// \headerfile <x86intrin.h>
3028///
3029/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3030///
3031/// \param __a
3032/// The single-precision floating point value to be broadcast.
3033/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3034/// equal to the broadcast value.
3035static __inline __m128 __DEFAULT_FN_ATTRS128
3037{
3038 struct __mm_broadcast_ss_struct {
3039 float __f;
3040 } __attribute__((__packed__, __may_alias__));
3041 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3042 return __extension__ (__m128){ __f, __f, __f, __f };
3043}
3044
3045/// Loads a scalar double-precision floating point value from the
3046/// specified address pointed to by \a __a and broadcasts it to the elements
3047/// of a [4 x double] vector.
3048///
3049/// \headerfile <x86intrin.h>
3050///
3051/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3052///
3053/// \param __a
3054/// The double-precision floating point value to be broadcast.
3055/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3056/// equal to the broadcast value.
3057static __inline __m256d __DEFAULT_FN_ATTRS
3059{
3060 struct __mm256_broadcast_sd_struct {
3061 double __d;
3062 } __attribute__((__packed__, __may_alias__));
3063 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3064 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3065}
3066
3067/// Loads a scalar single-precision floating point value from the
3068/// specified address pointed to by \a __a and broadcasts it to the elements
3069/// of a [8 x float] vector.
3070///
3071/// \headerfile <x86intrin.h>
3072///
3073/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3074///
3075/// \param __a
3076/// The single-precision floating point value to be broadcast.
3077/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3078/// equal to the broadcast value.
3079static __inline __m256 __DEFAULT_FN_ATTRS
3081{
3082 struct __mm256_broadcast_ss_struct {
3083 float __f;
3084 } __attribute__((__packed__, __may_alias__));
3085 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3086 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3087}
3088
3089/// Loads the data from a 128-bit vector of [2 x double] from the
3090/// specified address pointed to by \a __a and broadcasts it to 128-bit
3091/// elements in a 256-bit vector of [4 x double].
3092///
3093/// \headerfile <x86intrin.h>
3094///
3095/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3096///
3097/// \param __a
3098/// The 128-bit vector of [2 x double] to be broadcast.
3099/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3100/// equal to the broadcast value.
3101static __inline __m256d __DEFAULT_FN_ATTRS
3103{
3104 __m128d __b = _mm_loadu_pd((const double *)__a);
3105 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3106 0, 1, 0, 1);
3107}
3108
3109/// Loads the data from a 128-bit vector of [4 x float] from the
3110/// specified address pointed to by \a __a and broadcasts it to 128-bit
3111/// elements in a 256-bit vector of [8 x float].
3112///
3113/// \headerfile <x86intrin.h>
3114///
3115/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3116///
3117/// \param __a
3118/// The 128-bit vector of [4 x float] to be broadcast.
3119/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3120/// equal to the broadcast value.
3121static __inline __m256 __DEFAULT_FN_ATTRS
3123{
3124 __m128 __b = _mm_loadu_ps((const float *)__a);
3125 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3126 0, 1, 2, 3, 0, 1, 2, 3);
3127}
3128
3129/* SIMD load ops */
3130/// Loads 4 double-precision floating point values from a 32-byte aligned
3131/// memory location pointed to by \a __p into a vector of [4 x double].
3132///
3133/// \headerfile <x86intrin.h>
3134///
3135/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3136///
3137/// \param __p
3138/// A 32-byte aligned pointer to a memory location containing
3139/// double-precision floating point values.
3140/// \returns A 256-bit vector of [4 x double] containing the moved values.
3141static __inline __m256d __DEFAULT_FN_ATTRS
3142_mm256_load_pd(double const *__p)
3143{
3144 return *(const __m256d *)__p;
3145}
3146
3147/// Loads 8 single-precision floating point values from a 32-byte aligned
3148/// memory location pointed to by \a __p into a vector of [8 x float].
3149///
3150/// \headerfile <x86intrin.h>
3151///
3152/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3153///
3154/// \param __p
3155/// A 32-byte aligned pointer to a memory location containing float values.
3156/// \returns A 256-bit vector of [8 x float] containing the moved values.
3157static __inline __m256 __DEFAULT_FN_ATTRS
3158_mm256_load_ps(float const *__p)
3159{
3160 return *(const __m256 *)__p;
3161}
3162
3163/// Loads 4 double-precision floating point values from an unaligned
3164/// memory location pointed to by \a __p into a vector of [4 x double].
3165///
3166/// \headerfile <x86intrin.h>
3167///
3168/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3169///
3170/// \param __p
3171/// A pointer to a memory location containing double-precision floating
3172/// point values.
3173/// \returns A 256-bit vector of [4 x double] containing the moved values.
3174static __inline __m256d __DEFAULT_FN_ATTRS
3175_mm256_loadu_pd(double const *__p)
3176{
3177 struct __loadu_pd {
3178 __m256d_u __v;
3179 } __attribute__((__packed__, __may_alias__));
3180 return ((const struct __loadu_pd*)__p)->__v;
3181}
3182
3183/// Loads 8 single-precision floating point values from an unaligned
3184/// memory location pointed to by \a __p into a vector of [8 x float].
3185///
3186/// \headerfile <x86intrin.h>
3187///
3188/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3189///
3190/// \param __p
3191/// A pointer to a memory location containing single-precision floating
3192/// point values.
3193/// \returns A 256-bit vector of [8 x float] containing the moved values.
3194static __inline __m256 __DEFAULT_FN_ATTRS
3196{
3197 struct __loadu_ps {
3198 __m256_u __v;
3199 } __attribute__((__packed__, __may_alias__));
3200 return ((const struct __loadu_ps*)__p)->__v;
3201}
3202
3203/// Loads 256 bits of integer data from a 32-byte aligned memory
3204/// location pointed to by \a __p into elements of a 256-bit integer vector.
3205///
3206/// \headerfile <x86intrin.h>
3207///
3208/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3209///
3210/// \param __p
3211/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3212/// values.
3213/// \returns A 256-bit integer vector containing the moved values.
3214static __inline __m256i __DEFAULT_FN_ATTRS
3215_mm256_load_si256(__m256i const *__p)
3216{
3217 return *__p;
3218}
3219
3220/// Loads 256 bits of integer data from an unaligned memory location
3221/// pointed to by \a __p into a 256-bit integer vector.
3222///
3223/// \headerfile <x86intrin.h>
3224///
3225/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3226///
3227/// \param __p
3228/// A pointer to a 256-bit integer vector containing integer values.
3229/// \returns A 256-bit integer vector containing the moved values.
3230static __inline __m256i __DEFAULT_FN_ATTRS
3231_mm256_loadu_si256(__m256i_u const *__p)
3232{
3233 struct __loadu_si256 {
3234 __m256i_u __v;
3235 } __attribute__((__packed__, __may_alias__));
3236 return ((const struct __loadu_si256*)__p)->__v;
3237}
3238
3239/// Loads 256 bits of integer data from an unaligned memory location
3240/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3241/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3242/// line boundary.
3243///
3244/// \headerfile <x86intrin.h>
3245///
3246/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3247///
3248/// \param __p
3249/// A pointer to a 256-bit integer vector containing integer values.
3250/// \returns A 256-bit integer vector containing the moved values.
3251static __inline __m256i __DEFAULT_FN_ATTRS
3252_mm256_lddqu_si256(__m256i_u const *__p)
3253{
3254 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3255}
3256
3257/* SIMD store ops */
3258/// Stores double-precision floating point values from a 256-bit vector
3259/// of [4 x double] to a 32-byte aligned memory location pointed to by
3260/// \a __p.
3261///
3262/// \headerfile <x86intrin.h>
3263///
3264/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3265///
3266/// \param __p
3267/// A 32-byte aligned pointer to a memory location that will receive the
3268/// double-precision floaing point values.
3269/// \param __a
3270/// A 256-bit vector of [4 x double] containing the values to be moved.
3271static __inline void __DEFAULT_FN_ATTRS
3272_mm256_store_pd(double *__p, __m256d __a)
3273{
3274 *(__m256d *)__p = __a;
3275}
3276
3277/// Stores single-precision floating point values from a 256-bit vector
3278/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3279///
3280/// \headerfile <x86intrin.h>
3281///
3282/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3283///
3284/// \param __p
3285/// A 32-byte aligned pointer to a memory location that will receive the
3286/// float values.
3287/// \param __a
3288/// A 256-bit vector of [8 x float] containing the values to be moved.
3289static __inline void __DEFAULT_FN_ATTRS
3290_mm256_store_ps(float *__p, __m256 __a)
3291{
3292 *(__m256 *)__p = __a;
3293}
3294
3295/// Stores double-precision floating point values from a 256-bit vector
3296/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3297///
3298/// \headerfile <x86intrin.h>
3299///
3300/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3301///
3302/// \param __p
3303/// A pointer to a memory location that will receive the double-precision
3304/// floating point values.
3305/// \param __a
3306/// A 256-bit vector of [4 x double] containing the values to be moved.
3307static __inline void __DEFAULT_FN_ATTRS
3308_mm256_storeu_pd(double *__p, __m256d __a)
3309{
3310 struct __storeu_pd {
3311 __m256d_u __v;
3312 } __attribute__((__packed__, __may_alias__));
3313 ((struct __storeu_pd*)__p)->__v = __a;
3314}
3315
3316/// Stores single-precision floating point values from a 256-bit vector
3317/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3318///
3319/// \headerfile <x86intrin.h>
3320///
3321/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3322///
3323/// \param __p
3324/// A pointer to a memory location that will receive the float values.
3325/// \param __a
3326/// A 256-bit vector of [8 x float] containing the values to be moved.
3327static __inline void __DEFAULT_FN_ATTRS
3328_mm256_storeu_ps(float *__p, __m256 __a)
3329{
3330 struct __storeu_ps {
3331 __m256_u __v;
3332 } __attribute__((__packed__, __may_alias__));
3333 ((struct __storeu_ps*)__p)->__v = __a;
3334}
3335
3336/// Stores integer values from a 256-bit integer vector to a 32-byte
3337/// aligned memory location pointed to by \a __p.
3338///
3339/// \headerfile <x86intrin.h>
3340///
3341/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3342///
3343/// \param __p
3344/// A 32-byte aligned pointer to a memory location that will receive the
3345/// integer values.
3346/// \param __a
3347/// A 256-bit integer vector containing the values to be moved.
3348static __inline void __DEFAULT_FN_ATTRS
3349_mm256_store_si256(__m256i *__p, __m256i __a)
3350{
3351 *__p = __a;
3352}
3353
3354/// Stores integer values from a 256-bit integer vector to an unaligned
3355/// memory location pointed to by \a __p.
3356///
3357/// \headerfile <x86intrin.h>
3358///
3359/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3360///
3361/// \param __p
3362/// A pointer to a memory location that will receive the integer values.
3363/// \param __a
3364/// A 256-bit integer vector containing the values to be moved.
3365static __inline void __DEFAULT_FN_ATTRS
3366_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3367{
3368 struct __storeu_si256 {
3369 __m256i_u __v;
3370 } __attribute__((__packed__, __may_alias__));
3371 ((struct __storeu_si256*)__p)->__v = __a;
3372}
3373
3374/* Conditional load ops */
3375/// Conditionally loads double-precision floating point elements from a
3376/// memory location pointed to by \a __p into a 128-bit vector of
3377/// [2 x double], depending on the mask bits associated with each data
3378/// element.
3379///
3380/// \headerfile <x86intrin.h>
3381///
3382/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3383///
3384/// \param __p
3385/// A pointer to a memory location that contains the double-precision
3386/// floating point values.
3387/// \param __m
3388/// A 128-bit integer vector containing the mask. The most significant bit of
3389/// each data element represents the mask bits. If a mask bit is zero, the
3390/// corresponding value in the memory location is not loaded and the
3391/// corresponding field in the return value is set to zero.
3392/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3393static __inline __m128d __DEFAULT_FN_ATTRS128
3394_mm_maskload_pd(double const *__p, __m128i __m)
3395{
3396 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3397}
3398
3399/// Conditionally loads double-precision floating point elements from a
3400/// memory location pointed to by \a __p into a 256-bit vector of
3401/// [4 x double], depending on the mask bits associated with each data
3402/// element.
3403///
3404/// \headerfile <x86intrin.h>
3405///
3406/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3407///
3408/// \param __p
3409/// A pointer to a memory location that contains the double-precision
3410/// floating point values.
3411/// \param __m
3412/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3413/// significant bit of each quadword element represents the mask bits. If a
3414/// mask bit is zero, the corresponding value in the memory location is not
3415/// loaded and the corresponding field in the return value is set to zero.
3416/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3417static __inline __m256d __DEFAULT_FN_ATTRS
3418_mm256_maskload_pd(double const *__p, __m256i __m)
3419{
3420 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3421 (__v4di)__m);
3422}
3423
3424/// Conditionally loads single-precision floating point elements from a
3425/// memory location pointed to by \a __p into a 128-bit vector of
3426/// [4 x float], depending on the mask bits associated with each data
3427/// element.
3428///
3429/// \headerfile <x86intrin.h>
3430///
3431/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3432///
3433/// \param __p
3434/// A pointer to a memory location that contains the single-precision
3435/// floating point values.
3436/// \param __m
3437/// A 128-bit integer vector containing the mask. The most significant bit of
3438/// each data element represents the mask bits. If a mask bit is zero, the
3439/// corresponding value in the memory location is not loaded and the
3440/// corresponding field in the return value is set to zero.
3441/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3442static __inline __m128 __DEFAULT_FN_ATTRS128
3443_mm_maskload_ps(float const *__p, __m128i __m)
3444{
3445 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3446}
3447
3448/// Conditionally loads single-precision floating point elements from a
3449/// memory location pointed to by \a __p into a 256-bit vector of
3450/// [8 x float], depending on the mask bits associated with each data
3451/// element.
3452///
3453/// \headerfile <x86intrin.h>
3454///
3455/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3456///
3457/// \param __p
3458/// A pointer to a memory location that contains the single-precision
3459/// floating point values.
3460/// \param __m
3461/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3462/// significant bit of each dword element represents the mask bits. If a mask
3463/// bit is zero, the corresponding value in the memory location is not loaded
3464/// and the corresponding field in the return value is set to zero.
3465/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3466static __inline __m256 __DEFAULT_FN_ATTRS
3467_mm256_maskload_ps(float const *__p, __m256i __m)
3468{
3469 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3470}
3471
3472/* Conditional store ops */
3473/// Moves single-precision floating point values from a 256-bit vector
3474/// of [8 x float] to a memory location pointed to by \a __p, according to
3475/// the specified mask.
3476///
3477/// \headerfile <x86intrin.h>
3478///
3479/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3480///
3481/// \param __p
3482/// A pointer to a memory location that will receive the float values.
3483/// \param __m
3484/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3485/// significant bit of each dword element in the mask vector represents the
3486/// mask bits. If a mask bit is zero, the corresponding value from vector
3487/// \a __a is not stored and the corresponding field in the memory location
3488/// pointed to by \a __p is not changed.
3489/// \param __a
3490/// A 256-bit vector of [8 x float] containing the values to be stored.
3491static __inline void __DEFAULT_FN_ATTRS
3492_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3493{
3494 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3495}
3496
3497/// Moves double-precision values from a 128-bit vector of [2 x double]
3498/// to a memory location pointed to by \a __p, according to the specified
3499/// mask.
3500///
3501/// \headerfile <x86intrin.h>
3502///
3503/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3504///
3505/// \param __p
3506/// A pointer to a memory location that will receive the float values.
3507/// \param __m
3508/// A 128-bit integer vector containing the mask. The most significant bit of
3509/// each field in the mask vector represents the mask bits. If a mask bit is
3510/// zero, the corresponding value from vector \a __a is not stored and the
3511/// corresponding field in the memory location pointed to by \a __p is not
3512/// changed.
3513/// \param __a
3514/// A 128-bit vector of [2 x double] containing the values to be stored.
3515static __inline void __DEFAULT_FN_ATTRS128
3516_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3517{
3518 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3519}
3520
3521/// Moves double-precision values from a 256-bit vector of [4 x double]
3522/// to a memory location pointed to by \a __p, according to the specified
3523/// mask.
3524///
3525/// \headerfile <x86intrin.h>
3526///
3527/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3528///
3529/// \param __p
3530/// A pointer to a memory location that will receive the float values.
3531/// \param __m
3532/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3533/// significant bit of each quadword element in the mask vector represents
3534/// the mask bits. If a mask bit is zero, the corresponding value from vector
3535/// __a is not stored and the corresponding field in the memory location
3536/// pointed to by \a __p is not changed.
3537/// \param __a
3538/// A 256-bit vector of [4 x double] containing the values to be stored.
3539static __inline void __DEFAULT_FN_ATTRS
3540_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3541{
3542 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3543}
3544
3545/// Moves single-precision floating point values from a 128-bit vector
3546/// of [4 x float] to a memory location pointed to by \a __p, according to
3547/// the specified mask.
3548///
3549/// \headerfile <x86intrin.h>
3550///
3551/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3552///
3553/// \param __p
3554/// A pointer to a memory location that will receive the float values.
3555/// \param __m
3556/// A 128-bit integer vector containing the mask. The most significant bit of
3557/// each field in the mask vector represents the mask bits. If a mask bit is
3558/// zero, the corresponding value from vector __a is not stored and the
3559/// corresponding field in the memory location pointed to by \a __p is not
3560/// changed.
3561/// \param __a
3562/// A 128-bit vector of [4 x float] containing the values to be stored.
3563static __inline void __DEFAULT_FN_ATTRS128
3564_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3565{
3566 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3567}
3568
3569/* Cacheability support ops */
3570/// Moves integer data from a 256-bit integer vector to a 32-byte
3571/// aligned memory location. To minimize caching, the data is flagged as
3572/// non-temporal (unlikely to be used again soon).
3573///
3574/// \headerfile <x86intrin.h>
3575///
3576/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3577///
3578/// \param __a
3579/// A pointer to a 32-byte aligned memory location that will receive the
3580/// integer values.
3581/// \param __b
3582/// A 256-bit integer vector containing the values to be moved.
3583static __inline void __DEFAULT_FN_ATTRS
3585{
3586 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3587 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3588}
3589
3590/// Moves double-precision values from a 256-bit vector of [4 x double]
3591/// to a 32-byte aligned memory location. To minimize caching, the data is
3592/// flagged as non-temporal (unlikely to be used again soon).
3593///
3594/// \headerfile <x86intrin.h>
3595///
3596/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3597///
3598/// \param __a
3599/// A pointer to a 32-byte aligned memory location that will receive the
3600/// double-precision floating-point values.
3601/// \param __b
3602/// A 256-bit vector of [4 x double] containing the values to be moved.
3603static __inline void __DEFAULT_FN_ATTRS
3604_mm256_stream_pd(void *__a, __m256d __b)
3605{
3606 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3607 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3608}
3609
3610/// Moves single-precision floating point values from a 256-bit vector
3611/// of [8 x float] to a 32-byte aligned memory location. To minimize
3612/// caching, the data is flagged as non-temporal (unlikely to be used again
3613/// soon).
3614///
3615/// \headerfile <x86intrin.h>
3616///
3617/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3618///
3619/// \param __p
3620/// A pointer to a 32-byte aligned memory location that will receive the
3621/// single-precision floating point values.
3622/// \param __a
3623/// A 256-bit vector of [8 x float] containing the values to be moved.
3624static __inline void __DEFAULT_FN_ATTRS
3626{
3627 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3628 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3629}
3630
3631/* Create vectors */
3632/// Create a 256-bit vector of [4 x double] with undefined values.
3633///
3634/// \headerfile <x86intrin.h>
3635///
3636/// This intrinsic has no corresponding instruction.
3637///
3638/// \returns A 256-bit vector of [4 x double] containing undefined values.
3639static __inline__ __m256d __DEFAULT_FN_ATTRS
3641{
3642 return (__m256d)__builtin_ia32_undef256();
3643}
3644
3645/// Create a 256-bit vector of [8 x float] with undefined values.
3646///
3647/// \headerfile <x86intrin.h>
3648///
3649/// This intrinsic has no corresponding instruction.
3650///
3651/// \returns A 256-bit vector of [8 x float] containing undefined values.
3652static __inline__ __m256 __DEFAULT_FN_ATTRS
3654{
3655 return (__m256)__builtin_ia32_undef256();
3656}
3657
3658/// Create a 256-bit integer vector with undefined values.
3659///
3660/// \headerfile <x86intrin.h>
3661///
3662/// This intrinsic has no corresponding instruction.
3663///
3664/// \returns A 256-bit integer vector containing undefined values.
3665static __inline__ __m256i __DEFAULT_FN_ATTRS
3667{
3668 return (__m256i)__builtin_ia32_undef256();
3669}
3670
3671/// Constructs a 256-bit floating-point vector of [4 x double]
3672/// initialized with the specified double-precision floating-point values.
3673///
3674/// \headerfile <x86intrin.h>
3675///
3676/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3677/// instruction.
3678///
3679/// \param __a
3680/// A double-precision floating-point value used to initialize bits [255:192]
3681/// of the result.
3682/// \param __b
3683/// A double-precision floating-point value used to initialize bits [191:128]
3684/// of the result.
3685/// \param __c
3686/// A double-precision floating-point value used to initialize bits [127:64]
3687/// of the result.
3688/// \param __d
3689/// A double-precision floating-point value used to initialize bits [63:0]
3690/// of the result.
3691/// \returns An initialized 256-bit floating-point vector of [4 x double].
3692static __inline __m256d __DEFAULT_FN_ATTRS
3693_mm256_set_pd(double __a, double __b, double __c, double __d)
3694{
3695 return __extension__ (__m256d){ __d, __c, __b, __a };
3696}
3697
3698/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3699/// with the specified single-precision floating-point values.
3700///
3701/// \headerfile <x86intrin.h>
3702///
3703/// This intrinsic is a utility function and does not correspond to a specific
3704/// instruction.
3705///
3706/// \param __a
3707/// A single-precision floating-point value used to initialize bits [255:224]
3708/// of the result.
3709/// \param __b
3710/// A single-precision floating-point value used to initialize bits [223:192]
3711/// of the result.
3712/// \param __c
3713/// A single-precision floating-point value used to initialize bits [191:160]
3714/// of the result.
3715/// \param __d
3716/// A single-precision floating-point value used to initialize bits [159:128]
3717/// of the result.
3718/// \param __e
3719/// A single-precision floating-point value used to initialize bits [127:96]
3720/// of the result.
3721/// \param __f
3722/// A single-precision floating-point value used to initialize bits [95:64]
3723/// of the result.
3724/// \param __g
3725/// A single-precision floating-point value used to initialize bits [63:32]
3726/// of the result.
3727/// \param __h
3728/// A single-precision floating-point value used to initialize bits [31:0]
3729/// of the result.
3730/// \returns An initialized 256-bit floating-point vector of [8 x float].
3731static __inline __m256 __DEFAULT_FN_ATTRS
3732_mm256_set_ps(float __a, float __b, float __c, float __d,
3733 float __e, float __f, float __g, float __h)
3734{
3735 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3736}
3737
3738/// Constructs a 256-bit integer vector initialized with the specified
3739/// 32-bit integral values.
3740///
3741/// \headerfile <x86intrin.h>
3742///
3743/// This intrinsic is a utility function and does not correspond to a specific
3744/// instruction.
3745///
3746/// \param __i0
3747/// A 32-bit integral value used to initialize bits [255:224] of the result.
3748/// \param __i1
3749/// A 32-bit integral value used to initialize bits [223:192] of the result.
3750/// \param __i2
3751/// A 32-bit integral value used to initialize bits [191:160] of the result.
3752/// \param __i3
3753/// A 32-bit integral value used to initialize bits [159:128] of the result.
3754/// \param __i4
3755/// A 32-bit integral value used to initialize bits [127:96] of the result.
3756/// \param __i5
3757/// A 32-bit integral value used to initialize bits [95:64] of the result.
3758/// \param __i6
3759/// A 32-bit integral value used to initialize bits [63:32] of the result.
3760/// \param __i7
3761/// A 32-bit integral value used to initialize bits [31:0] of the result.
3762/// \returns An initialized 256-bit integer vector.
3763static __inline __m256i __DEFAULT_FN_ATTRS
3764_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3765 int __i4, int __i5, int __i6, int __i7)
3766{
3767 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3768}
3769
3770/// Constructs a 256-bit integer vector initialized with the specified
3771/// 16-bit integral values.
3772///
3773/// \headerfile <x86intrin.h>
3774///
3775/// This intrinsic is a utility function and does not correspond to a specific
3776/// instruction.
3777///
3778/// \param __w15
3779/// A 16-bit integral value used to initialize bits [255:240] of the result.
3780/// \param __w14
3781/// A 16-bit integral value used to initialize bits [239:224] of the result.
3782/// \param __w13
3783/// A 16-bit integral value used to initialize bits [223:208] of the result.
3784/// \param __w12
3785/// A 16-bit integral value used to initialize bits [207:192] of the result.
3786/// \param __w11
3787/// A 16-bit integral value used to initialize bits [191:176] of the result.
3788/// \param __w10
3789/// A 16-bit integral value used to initialize bits [175:160] of the result.
3790/// \param __w09
3791/// A 16-bit integral value used to initialize bits [159:144] of the result.
3792/// \param __w08
3793/// A 16-bit integral value used to initialize bits [143:128] of the result.
3794/// \param __w07
3795/// A 16-bit integral value used to initialize bits [127:112] of the result.
3796/// \param __w06
3797/// A 16-bit integral value used to initialize bits [111:96] of the result.
3798/// \param __w05
3799/// A 16-bit integral value used to initialize bits [95:80] of the result.
3800/// \param __w04
3801/// A 16-bit integral value used to initialize bits [79:64] of the result.
3802/// \param __w03
3803/// A 16-bit integral value used to initialize bits [63:48] of the result.
3804/// \param __w02
3805/// A 16-bit integral value used to initialize bits [47:32] of the result.
3806/// \param __w01
3807/// A 16-bit integral value used to initialize bits [31:16] of the result.
3808/// \param __w00
3809/// A 16-bit integral value used to initialize bits [15:0] of the result.
3810/// \returns An initialized 256-bit integer vector.
3811static __inline __m256i __DEFAULT_FN_ATTRS
3812_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3813 short __w11, short __w10, short __w09, short __w08,
3814 short __w07, short __w06, short __w05, short __w04,
3815 short __w03, short __w02, short __w01, short __w00)
3816{
3817 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3818 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3819}
3820
3821/// Constructs a 256-bit integer vector initialized with the specified
3822/// 8-bit integral values.
3823///
3824/// \headerfile <x86intrin.h>
3825///
3826/// This intrinsic is a utility function and does not correspond to a specific
3827/// instruction.
3828///
3829/// \param __b31
3830/// An 8-bit integral value used to initialize bits [255:248] of the result.
3831/// \param __b30
3832/// An 8-bit integral value used to initialize bits [247:240] of the result.
3833/// \param __b29
3834/// An 8-bit integral value used to initialize bits [239:232] of the result.
3835/// \param __b28
3836/// An 8-bit integral value used to initialize bits [231:224] of the result.
3837/// \param __b27
3838/// An 8-bit integral value used to initialize bits [223:216] of the result.
3839/// \param __b26
3840/// An 8-bit integral value used to initialize bits [215:208] of the result.
3841/// \param __b25
3842/// An 8-bit integral value used to initialize bits [207:200] of the result.
3843/// \param __b24
3844/// An 8-bit integral value used to initialize bits [199:192] of the result.
3845/// \param __b23
3846/// An 8-bit integral value used to initialize bits [191:184] of the result.
3847/// \param __b22
3848/// An 8-bit integral value used to initialize bits [183:176] of the result.
3849/// \param __b21
3850/// An 8-bit integral value used to initialize bits [175:168] of the result.
3851/// \param __b20
3852/// An 8-bit integral value used to initialize bits [167:160] of the result.
3853/// \param __b19
3854/// An 8-bit integral value used to initialize bits [159:152] of the result.
3855/// \param __b18
3856/// An 8-bit integral value used to initialize bits [151:144] of the result.
3857/// \param __b17
3858/// An 8-bit integral value used to initialize bits [143:136] of the result.
3859/// \param __b16
3860/// An 8-bit integral value used to initialize bits [135:128] of the result.
3861/// \param __b15
3862/// An 8-bit integral value used to initialize bits [127:120] of the result.
3863/// \param __b14
3864/// An 8-bit integral value used to initialize bits [119:112] of the result.
3865/// \param __b13
3866/// An 8-bit integral value used to initialize bits [111:104] of the result.
3867/// \param __b12
3868/// An 8-bit integral value used to initialize bits [103:96] of the result.
3869/// \param __b11
3870/// An 8-bit integral value used to initialize bits [95:88] of the result.
3871/// \param __b10
3872/// An 8-bit integral value used to initialize bits [87:80] of the result.
3873/// \param __b09
3874/// An 8-bit integral value used to initialize bits [79:72] of the result.
3875/// \param __b08
3876/// An 8-bit integral value used to initialize bits [71:64] of the result.
3877/// \param __b07
3878/// An 8-bit integral value used to initialize bits [63:56] of the result.
3879/// \param __b06
3880/// An 8-bit integral value used to initialize bits [55:48] of the result.
3881/// \param __b05
3882/// An 8-bit integral value used to initialize bits [47:40] of the result.
3883/// \param __b04
3884/// An 8-bit integral value used to initialize bits [39:32] of the result.
3885/// \param __b03
3886/// An 8-bit integral value used to initialize bits [31:24] of the result.
3887/// \param __b02
3888/// An 8-bit integral value used to initialize bits [23:16] of the result.
3889/// \param __b01
3890/// An 8-bit integral value used to initialize bits [15:8] of the result.
3891/// \param __b00
3892/// An 8-bit integral value used to initialize bits [7:0] of the result.
3893/// \returns An initialized 256-bit integer vector.
3894static __inline __m256i __DEFAULT_FN_ATTRS
3895_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3896 char __b27, char __b26, char __b25, char __b24,
3897 char __b23, char __b22, char __b21, char __b20,
3898 char __b19, char __b18, char __b17, char __b16,
3899 char __b15, char __b14, char __b13, char __b12,
3900 char __b11, char __b10, char __b09, char __b08,
3901 char __b07, char __b06, char __b05, char __b04,
3902 char __b03, char __b02, char __b01, char __b00)
3903{
3904 return __extension__ (__m256i)(__v32qi){
3905 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3906 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3907 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3908 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3909 };
3910}
3911
3912/// Constructs a 256-bit integer vector initialized with the specified
3913/// 64-bit integral values.
3914///
3915/// \headerfile <x86intrin.h>
3916///
3917/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3918/// instruction.
3919///
3920/// \param __a
3921/// A 64-bit integral value used to initialize bits [255:192] of the result.
3922/// \param __b
3923/// A 64-bit integral value used to initialize bits [191:128] of the result.
3924/// \param __c
3925/// A 64-bit integral value used to initialize bits [127:64] of the result.
3926/// \param __d
3927/// A 64-bit integral value used to initialize bits [63:0] of the result.
3928/// \returns An initialized 256-bit integer vector.
3929static __inline __m256i __DEFAULT_FN_ATTRS
3930_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3931{
3932 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3933}
3934
3935/* Create vectors with elements in reverse order */
3936/// Constructs a 256-bit floating-point vector of [4 x double],
3937/// initialized in reverse order with the specified double-precision
3938/// floating-point values.
3939///
3940/// \headerfile <x86intrin.h>
3941///
3942/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3943/// instruction.
3944///
3945/// \param __a
3946/// A double-precision floating-point value used to initialize bits [63:0]
3947/// of the result.
3948/// \param __b
3949/// A double-precision floating-point value used to initialize bits [127:64]
3950/// of the result.
3951/// \param __c
3952/// A double-precision floating-point value used to initialize bits [191:128]
3953/// of the result.
3954/// \param __d
3955/// A double-precision floating-point value used to initialize bits [255:192]
3956/// of the result.
3957/// \returns An initialized 256-bit floating-point vector of [4 x double].
3958static __inline __m256d __DEFAULT_FN_ATTRS
3959_mm256_setr_pd(double __a, double __b, double __c, double __d)
3960{
3961 return _mm256_set_pd(__d, __c, __b, __a);
3962}
3963
3964/// Constructs a 256-bit floating-point vector of [8 x float],
3965/// initialized in reverse order with the specified single-precision
3966/// float-point values.
3967///
3968/// \headerfile <x86intrin.h>
3969///
3970/// This intrinsic is a utility function and does not correspond to a specific
3971/// instruction.
3972///
3973/// \param __a
3974/// A single-precision floating-point value used to initialize bits [31:0]
3975/// of the result.
3976/// \param __b
3977/// A single-precision floating-point value used to initialize bits [63:32]
3978/// of the result.
3979/// \param __c
3980/// A single-precision floating-point value used to initialize bits [95:64]
3981/// of the result.
3982/// \param __d
3983/// A single-precision floating-point value used to initialize bits [127:96]
3984/// of the result.
3985/// \param __e
3986/// A single-precision floating-point value used to initialize bits [159:128]
3987/// of the result.
3988/// \param __f
3989/// A single-precision floating-point value used to initialize bits [191:160]
3990/// of the result.
3991/// \param __g
3992/// A single-precision floating-point value used to initialize bits [223:192]
3993/// of the result.
3994/// \param __h
3995/// A single-precision floating-point value used to initialize bits [255:224]
3996/// of the result.
3997/// \returns An initialized 256-bit floating-point vector of [8 x float].
3998static __inline __m256 __DEFAULT_FN_ATTRS
3999_mm256_setr_ps(float __a, float __b, float __c, float __d,
4000 float __e, float __f, float __g, float __h)
4001{
4002 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
4003}
4004
4005/// Constructs a 256-bit integer vector, initialized in reverse order
4006/// with the specified 32-bit integral values.
4007///
4008/// \headerfile <x86intrin.h>
4009///
4010/// This intrinsic is a utility function and does not correspond to a specific
4011/// instruction.
4012///
4013/// \param __i0
4014/// A 32-bit integral value used to initialize bits [31:0] of the result.
4015/// \param __i1
4016/// A 32-bit integral value used to initialize bits [63:32] of the result.
4017/// \param __i2
4018/// A 32-bit integral value used to initialize bits [95:64] of the result.
4019/// \param __i3
4020/// A 32-bit integral value used to initialize bits [127:96] of the result.
4021/// \param __i4
4022/// A 32-bit integral value used to initialize bits [159:128] of the result.
4023/// \param __i5
4024/// A 32-bit integral value used to initialize bits [191:160] of the result.
4025/// \param __i6
4026/// A 32-bit integral value used to initialize bits [223:192] of the result.
4027/// \param __i7
4028/// A 32-bit integral value used to initialize bits [255:224] of the result.
4029/// \returns An initialized 256-bit integer vector.
4030static __inline __m256i __DEFAULT_FN_ATTRS
4031_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4032 int __i4, int __i5, int __i6, int __i7)
4033{
4034 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4035}
4036
4037/// Constructs a 256-bit integer vector, initialized in reverse order
4038/// with the specified 16-bit integral values.
4039///
4040/// \headerfile <x86intrin.h>
4041///
4042/// This intrinsic is a utility function and does not correspond to a specific
4043/// instruction.
4044///
4045/// \param __w15
4046/// A 16-bit integral value used to initialize bits [15:0] of the result.
4047/// \param __w14
4048/// A 16-bit integral value used to initialize bits [31:16] of the result.
4049/// \param __w13
4050/// A 16-bit integral value used to initialize bits [47:32] of the result.
4051/// \param __w12
4052/// A 16-bit integral value used to initialize bits [63:48] of the result.
4053/// \param __w11
4054/// A 16-bit integral value used to initialize bits [79:64] of the result.
4055/// \param __w10
4056/// A 16-bit integral value used to initialize bits [95:80] of the result.
4057/// \param __w09
4058/// A 16-bit integral value used to initialize bits [111:96] of the result.
4059/// \param __w08
4060/// A 16-bit integral value used to initialize bits [127:112] of the result.
4061/// \param __w07
4062/// A 16-bit integral value used to initialize bits [143:128] of the result.
4063/// \param __w06
4064/// A 16-bit integral value used to initialize bits [159:144] of the result.
4065/// \param __w05
4066/// A 16-bit integral value used to initialize bits [175:160] of the result.
4067/// \param __w04
4068/// A 16-bit integral value used to initialize bits [191:176] of the result.
4069/// \param __w03
4070/// A 16-bit integral value used to initialize bits [207:192] of the result.
4071/// \param __w02
4072/// A 16-bit integral value used to initialize bits [223:208] of the result.
4073/// \param __w01
4074/// A 16-bit integral value used to initialize bits [239:224] of the result.
4075/// \param __w00
4076/// A 16-bit integral value used to initialize bits [255:240] of the result.
4077/// \returns An initialized 256-bit integer vector.
4078static __inline __m256i __DEFAULT_FN_ATTRS
4079_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4080 short __w11, short __w10, short __w09, short __w08,
4081 short __w07, short __w06, short __w05, short __w04,
4082 short __w03, short __w02, short __w01, short __w00)
4083{
4084 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4085 __w04, __w05, __w06, __w07,
4086 __w08, __w09, __w10, __w11,
4087 __w12, __w13, __w14, __w15);
4088}
4089
4090/// Constructs a 256-bit integer vector, initialized in reverse order
4091/// with the specified 8-bit integral values.
4092///
4093/// \headerfile <x86intrin.h>
4094///
4095/// This intrinsic is a utility function and does not correspond to a specific
4096/// instruction.
4097///
4098/// \param __b31
4099/// An 8-bit integral value used to initialize bits [7:0] of the result.
4100/// \param __b30
4101/// An 8-bit integral value used to initialize bits [15:8] of the result.
4102/// \param __b29
4103/// An 8-bit integral value used to initialize bits [23:16] of the result.
4104/// \param __b28
4105/// An 8-bit integral value used to initialize bits [31:24] of the result.
4106/// \param __b27
4107/// An 8-bit integral value used to initialize bits [39:32] of the result.
4108/// \param __b26
4109/// An 8-bit integral value used to initialize bits [47:40] of the result.
4110/// \param __b25
4111/// An 8-bit integral value used to initialize bits [55:48] of the result.
4112/// \param __b24
4113/// An 8-bit integral value used to initialize bits [63:56] of the result.
4114/// \param __b23
4115/// An 8-bit integral value used to initialize bits [71:64] of the result.
4116/// \param __b22
4117/// An 8-bit integral value used to initialize bits [79:72] of the result.
4118/// \param __b21
4119/// An 8-bit integral value used to initialize bits [87:80] of the result.
4120/// \param __b20
4121/// An 8-bit integral value used to initialize bits [95:88] of the result.
4122/// \param __b19
4123/// An 8-bit integral value used to initialize bits [103:96] of the result.
4124/// \param __b18
4125/// An 8-bit integral value used to initialize bits [111:104] of the result.
4126/// \param __b17
4127/// An 8-bit integral value used to initialize bits [119:112] of the result.
4128/// \param __b16
4129/// An 8-bit integral value used to initialize bits [127:120] of the result.
4130/// \param __b15
4131/// An 8-bit integral value used to initialize bits [135:128] of the result.
4132/// \param __b14
4133/// An 8-bit integral value used to initialize bits [143:136] of the result.
4134/// \param __b13
4135/// An 8-bit integral value used to initialize bits [151:144] of the result.
4136/// \param __b12
4137/// An 8-bit integral value used to initialize bits [159:152] of the result.
4138/// \param __b11
4139/// An 8-bit integral value used to initialize bits [167:160] of the result.
4140/// \param __b10
4141/// An 8-bit integral value used to initialize bits [175:168] of the result.
4142/// \param __b09
4143/// An 8-bit integral value used to initialize bits [183:176] of the result.
4144/// \param __b08
4145/// An 8-bit integral value used to initialize bits [191:184] of the result.
4146/// \param __b07
4147/// An 8-bit integral value used to initialize bits [199:192] of the result.
4148/// \param __b06
4149/// An 8-bit integral value used to initialize bits [207:200] of the result.
4150/// \param __b05
4151/// An 8-bit integral value used to initialize bits [215:208] of the result.
4152/// \param __b04
4153/// An 8-bit integral value used to initialize bits [223:216] of the result.
4154/// \param __b03
4155/// An 8-bit integral value used to initialize bits [231:224] of the result.
4156/// \param __b02
4157/// An 8-bit integral value used to initialize bits [239:232] of the result.
4158/// \param __b01
4159/// An 8-bit integral value used to initialize bits [247:240] of the result.
4160/// \param __b00
4161/// An 8-bit integral value used to initialize bits [255:248] of the result.
4162/// \returns An initialized 256-bit integer vector.
4163static __inline __m256i __DEFAULT_FN_ATTRS
4164_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4165 char __b27, char __b26, char __b25, char __b24,
4166 char __b23, char __b22, char __b21, char __b20,
4167 char __b19, char __b18, char __b17, char __b16,
4168 char __b15, char __b14, char __b13, char __b12,
4169 char __b11, char __b10, char __b09, char __b08,
4170 char __b07, char __b06, char __b05, char __b04,
4171 char __b03, char __b02, char __b01, char __b00)
4172{
4173 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4174 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4175 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4176 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4177}
4178
4179/// Constructs a 256-bit integer vector, initialized in reverse order
4180/// with the specified 64-bit integral values.
4181///
4182/// \headerfile <x86intrin.h>
4183///
4184/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4185/// instruction.
4186///
4187/// \param __a
4188/// A 64-bit integral value used to initialize bits [63:0] of the result.
4189/// \param __b
4190/// A 64-bit integral value used to initialize bits [127:64] of the result.
4191/// \param __c
4192/// A 64-bit integral value used to initialize bits [191:128] of the result.
4193/// \param __d
4194/// A 64-bit integral value used to initialize bits [255:192] of the result.
4195/// \returns An initialized 256-bit integer vector.
4196static __inline __m256i __DEFAULT_FN_ATTRS
4197_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4198{
4199 return _mm256_set_epi64x(__d, __c, __b, __a);
4200}
4201
4202/* Create vectors with repeated elements */
4203/// Constructs a 256-bit floating-point vector of [4 x double], with each
4204/// of the four double-precision floating-point vector elements set to the
4205/// specified double-precision floating-point value.
4206///
4207/// \headerfile <x86intrin.h>
4208///
4209/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4210///
4211/// \param __w
4212/// A double-precision floating-point value used to initialize each vector
4213/// element of the result.
4214/// \returns An initialized 256-bit floating-point vector of [4 x double].
4215static __inline __m256d __DEFAULT_FN_ATTRS
4217{
4218 return _mm256_set_pd(__w, __w, __w, __w);
4219}
4220
4221/// Constructs a 256-bit floating-point vector of [8 x float], with each
4222/// of the eight single-precision floating-point vector elements set to the
4223/// specified single-precision floating-point value.
4224///
4225/// \headerfile <x86intrin.h>
4226///
4227/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4228/// instruction.
4229///
4230/// \param __w
4231/// A single-precision floating-point value used to initialize each vector
4232/// element of the result.
4233/// \returns An initialized 256-bit floating-point vector of [8 x float].
4234static __inline __m256 __DEFAULT_FN_ATTRS
4236{
4237 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4238}
4239
4240/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4241/// 32-bit integral vector elements set to the specified 32-bit integral
4242/// value.
4243///
4244/// \headerfile <x86intrin.h>
4245///
4246/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4247/// instruction.
4248///
4249/// \param __i
4250/// A 32-bit integral value used to initialize each vector element of the
4251/// result.
4252/// \returns An initialized 256-bit integer vector of [8 x i32].
4253static __inline __m256i __DEFAULT_FN_ATTRS
4255{
4256 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4257}
4258
4259/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4260/// 16-bit integral vector elements set to the specified 16-bit integral
4261/// value.
4262///
4263/// \headerfile <x86intrin.h>
4264///
4265/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4266///
4267/// \param __w
4268/// A 16-bit integral value used to initialize each vector element of the
4269/// result.
4270/// \returns An initialized 256-bit integer vector of [16 x i16].
4271static __inline __m256i __DEFAULT_FN_ATTRS
4273{
4274 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4275 __w, __w, __w, __w, __w, __w, __w, __w);
4276}
4277
4278/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4279/// 8-bit integral vector elements set to the specified 8-bit integral value.
4280///
4281/// \headerfile <x86intrin.h>
4282///
4283/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4284///
4285/// \param __b
4286/// An 8-bit integral value used to initialize each vector element of the
4287/// result.
4288/// \returns An initialized 256-bit integer vector of [32 x i8].
4289static __inline __m256i __DEFAULT_FN_ATTRS
4291{
4292 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4293 __b, __b, __b, __b, __b, __b, __b, __b,
4294 __b, __b, __b, __b, __b, __b, __b, __b,
4295 __b, __b, __b, __b, __b, __b, __b, __b);
4296}
4297
4298/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4299/// 64-bit integral vector elements set to the specified 64-bit integral
4300/// value.
4301///
4302/// \headerfile <x86intrin.h>
4303///
4304/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4305///
4306/// \param __q
4307/// A 64-bit integral value used to initialize each vector element of the
4308/// result.
4309/// \returns An initialized 256-bit integer vector of [4 x i64].
4310static __inline __m256i __DEFAULT_FN_ATTRS
4312{
4313 return _mm256_set_epi64x(__q, __q, __q, __q);
4314}
4315
/* Create zeroed vectors */
4317/// Constructs a 256-bit floating-point vector of [4 x double] with all
4318/// vector elements initialized to zero.
4319///
4320/// \headerfile <x86intrin.h>
4321///
4322/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4323///
4324/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4325static __inline __m256d __DEFAULT_FN_ATTRS
4327{
4328 return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
4329}
4330
4331/// Constructs a 256-bit floating-point vector of [8 x float] with all
4332/// vector elements initialized to zero.
4333///
4334/// \headerfile <x86intrin.h>
4335///
4336/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4337///
4338/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4339static __inline __m256 __DEFAULT_FN_ATTRS
4341{
4342 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4343}
4344
4345/// Constructs a 256-bit integer vector initialized to zero.
4346///
4347/// \headerfile <x86intrin.h>
4348///
4349/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4350///
4351/// \returns A 256-bit integer vector initialized to zero.
4352static __inline __m256i __DEFAULT_FN_ATTRS
4354{
4355 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4356}
4357
4358/* Cast between vector types */
4359/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4360/// floating-point vector of [8 x float].
4361///
4362/// \headerfile <x86intrin.h>
4363///
4364/// This intrinsic has no corresponding instruction.
4365///
4366/// \param __a
4367/// A 256-bit floating-point vector of [4 x double].
4368/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4369/// bitwise pattern as the parameter.
4370static __inline __m256 __DEFAULT_FN_ATTRS
4372{
4373 return (__m256)__a;
4374}
4375
4376/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4377/// integer vector.
4378///
4379/// \headerfile <x86intrin.h>
4380///
4381/// This intrinsic has no corresponding instruction.
4382///
4383/// \param __a
4384/// A 256-bit floating-point vector of [4 x double].
4385/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4386/// parameter.
4387static __inline __m256i __DEFAULT_FN_ATTRS
4389{
4390 return (__m256i)__a;
4391}
4392
4393/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4394/// floating-point vector of [4 x double].
4395///
4396/// \headerfile <x86intrin.h>
4397///
4398/// This intrinsic has no corresponding instruction.
4399///
4400/// \param __a
4401/// A 256-bit floating-point vector of [8 x float].
4402/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4403/// bitwise pattern as the parameter.
4404static __inline __m256d __DEFAULT_FN_ATTRS
4406{
4407 return (__m256d)__a;
4408}
4409
4410/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4411/// integer vector.
4412///
4413/// \headerfile <x86intrin.h>
4414///
4415/// This intrinsic has no corresponding instruction.
4416///
4417/// \param __a
4418/// A 256-bit floating-point vector of [8 x float].
4419/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4420/// parameter.
4421static __inline __m256i __DEFAULT_FN_ATTRS
4423{
4424 return (__m256i)__a;
4425}
4426
4427/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4428/// of [8 x float].
4429///
4430/// \headerfile <x86intrin.h>
4431///
4432/// This intrinsic has no corresponding instruction.
4433///
4434/// \param __a
4435/// A 256-bit integer vector.
4436/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4437/// bitwise pattern as the parameter.
4438static __inline __m256 __DEFAULT_FN_ATTRS
4440{
4441 return (__m256)__a;
4442}
4443
4444/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4445/// of [4 x double].
4446///
4447/// \headerfile <x86intrin.h>
4448///
4449/// This intrinsic has no corresponding instruction.
4450///
4451/// \param __a
4452/// A 256-bit integer vector.
4453/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4454/// bitwise pattern as the parameter.
4455static __inline __m256d __DEFAULT_FN_ATTRS
4457{
4458 return (__m256d)__a;
4459}
4460
4461/// Returns the lower 128 bits of a 256-bit floating-point vector of
4462/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4463///
4464/// \headerfile <x86intrin.h>
4465///
4466/// This intrinsic has no corresponding instruction.
4467///
4468/// \param __a
4469/// A 256-bit floating-point vector of [4 x double].
4470/// \returns A 128-bit floating-point vector of [2 x double] containing the
4471/// lower 128 bits of the parameter.
4472static __inline __m128d __DEFAULT_FN_ATTRS
4474{
4475 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4476}
4477
4478/// Returns the lower 128 bits of a 256-bit floating-point vector of
4479/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4480///
4481/// \headerfile <x86intrin.h>
4482///
4483/// This intrinsic has no corresponding instruction.
4484///
4485/// \param __a
4486/// A 256-bit floating-point vector of [8 x float].
4487/// \returns A 128-bit floating-point vector of [4 x float] containing the
4488/// lower 128 bits of the parameter.
4489static __inline __m128 __DEFAULT_FN_ATTRS
4491{
4492 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4493}
4494
4495/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4496///
4497/// \headerfile <x86intrin.h>
4498///
4499/// This intrinsic has no corresponding instruction.
4500///
4501/// \param __a
4502/// A 256-bit integer vector.
4503/// \returns A 128-bit integer vector containing the lower 128 bits of the
4504/// parameter.
4505static __inline __m128i __DEFAULT_FN_ATTRS
4507{
4508 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4509}
4510
4511/// Constructs a 256-bit floating-point vector of [4 x double] from a
4512/// 128-bit floating-point vector of [2 x double].
4513///
4514/// The lower 128 bits contain the value of the source vector. The contents
4515/// of the upper 128 bits are undefined.
4516///
4517/// \headerfile <x86intrin.h>
4518///
4519/// This intrinsic has no corresponding instruction.
4520///
4521/// \param __a
4522/// A 128-bit vector of [2 x double].
4523/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4524/// contain the value of the parameter. The contents of the upper 128 bits
4525/// are undefined.
4526static __inline __m256d __DEFAULT_FN_ATTRS
4528{
4529 return __builtin_shufflevector(
4530 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4531}
4532
4533/// Constructs a 256-bit floating-point vector of [8 x float] from a
4534/// 128-bit floating-point vector of [4 x float].
4535///
4536/// The lower 128 bits contain the value of the source vector. The contents
4537/// of the upper 128 bits are undefined.
4538///
4539/// \headerfile <x86intrin.h>
4540///
4541/// This intrinsic has no corresponding instruction.
4542///
4543/// \param __a
4544/// A 128-bit vector of [4 x float].
4545/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4546/// contain the value of the parameter. The contents of the upper 128 bits
4547/// are undefined.
4548static __inline __m256 __DEFAULT_FN_ATTRS
4550{
4551 return __builtin_shufflevector((__v4sf)__a,
4552 (__v4sf)__builtin_nondeterministic_value(__a),
4553 0, 1, 2, 3, 4, 5, 6, 7);
4554}
4555
4556/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4557///
4558/// The lower 128 bits contain the value of the source vector. The contents
4559/// of the upper 128 bits are undefined.
4560///
4561/// \headerfile <x86intrin.h>
4562///
4563/// This intrinsic has no corresponding instruction.
4564///
4565/// \param __a
4566/// A 128-bit integer vector.
4567/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4568/// the parameter. The contents of the upper 128 bits are undefined.
4569static __inline __m256i __DEFAULT_FN_ATTRS
4571{
4572 return __builtin_shufflevector(
4573 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4574}
4575
4576/// Constructs a 256-bit floating-point vector of [4 x double] from a
4577/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4578/// contain the value of the source vector. The upper 128 bits are set
4579/// to zero.
4580///
4581/// \headerfile <x86intrin.h>
4582///
4583/// This intrinsic has no corresponding instruction.
4584///
4585/// \param __a
4586/// A 128-bit vector of [2 x double].
4587/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4588/// contain the value of the parameter. The upper 128 bits are set to zero.
4589static __inline __m256d __DEFAULT_FN_ATTRS
4591{
4592 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4593}
4594
4595/// Constructs a 256-bit floating-point vector of [8 x float] from a
4596/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4597/// the value of the source vector. The upper 128 bits are set to zero.
4598///
4599/// \headerfile <x86intrin.h>
4600///
4601/// This intrinsic has no corresponding instruction.
4602///
4603/// \param __a
4604/// A 128-bit vector of [4 x float].
4605/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4606/// contain the value of the parameter. The upper 128 bits are set to zero.
4607static __inline __m256 __DEFAULT_FN_ATTRS
4609{
4610 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4611}
4612
4613/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4614/// The lower 128 bits contain the value of the source vector. The upper
4615/// 128 bits are set to zero.
4616///
4617/// \headerfile <x86intrin.h>
4618///
4619/// This intrinsic has no corresponding instruction.
4620///
4621/// \param __a
4622/// A 128-bit integer vector.
4623/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4624/// the parameter. The upper 128 bits are set to zero.
4625static __inline __m256i __DEFAULT_FN_ATTRS
4627{
4628 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4629}
4630
4631/*
4632 Vector insert.
4633 We use macros rather than inlines because we only want to accept
4634 invocations where the immediate M is a constant expression.
4635*/
/// Produces a 256-bit vector of [8 x float] that is a copy of the first
///    operand with one of its 128-bit halves overwritten by a 128-bit vector
///    of [4 x float] taken from the second operand.
///
///    The immediate operand chooses which half is overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It supplies the half of the result
///    that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. It is written to the lower or the
///    upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
#define _mm256_insertf128_ps(V1, V2, M)                                        \
  ((__m256)__builtin_ia32_vinsertf128_ps256(                                   \
      (__v8sf)(__m256)(V1), (__v4sf)(__m128)(V2), (int)(M)))
4673
/// Produces a 256-bit vector of [4 x double] that is a copy of the first
///    operand with one of its 128-bit halves overwritten by a 128-bit vector
///    of [2 x double] taken from the second operand.
///
///    The immediate operand chooses which half is overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the half of the result
///    that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. It is written to the lower or the
///    upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M)                                        \
  ((__m256d)__builtin_ia32_vinsertf128_pd256(                                  \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))
4711
/// Produces a 256-bit integer vector that is a copy of the first operand
///    with one of its 128-bit halves overwritten by a 128-bit integer vector
///    taken from the second operand.
///
///    The immediate operand chooses which half is overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the half of the result that is
///    not overwritten by \a V2.
/// \param V2
///    A 128-bit integer vector. It is written to the lower or the upper
///    128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M)                                     \
  ((__m256i)__builtin_ia32_vinsertf128_si256(                                  \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
4749
4750/*
4751 Vector extract.
4752 We use macros rather than inlines because we only want to accept
4753 invocations where the immediate M is a constant expression.
4754*/
/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a 128-bit
///    vector of [4 x float]. The immediate integer parameter selects whether
///    the upper or the lower 128 bits are extracted.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256( \
      (__v8sf)(__m256)(V), (int)(M)))
4778
/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a 128-bit
///    vector of [2 x double]. The immediate integer parameter selects whether
///    the upper or the lower 128 bits are extracted.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256( \
      (__v4df)(__m256d)(V), (int)(M)))
4802
/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit integer
///    vector. The immediate integer parameter selects whether the upper or
///    the lower 128 bits are extracted.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256( \
      (__v8si)(__m256i)(V), (int)(M)))
4826
4827/// Constructs a 256-bit floating-point vector of [8 x float] by
4828/// concatenating two 128-bit floating-point vectors of [4 x float].
4829///
4830/// \headerfile <x86intrin.h>
4831///
4832/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4833///
4834/// \param __hi
4835/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4836/// 128 bits of the result.
4837/// \param __lo
4838/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4839/// 128 bits of the result.
4840/// \returns A 256-bit floating-point vector of [8 x float] containing the
4841/// concatenated result.
4842static __inline __m256 __DEFAULT_FN_ATTRS
4843_mm256_set_m128 (__m128 __hi, __m128 __lo)
4844{
4845 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4846}
4847
4848/// Constructs a 256-bit floating-point vector of [4 x double] by
4849/// concatenating two 128-bit floating-point vectors of [2 x double].
4850///
4851/// \headerfile <x86intrin.h>
4852///
4853/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4854///
4855/// \param __hi
4856/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4857/// 128 bits of the result.
4858/// \param __lo
4859/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4860/// 128 bits of the result.
4861/// \returns A 256-bit floating-point vector of [4 x double] containing the
4862/// concatenated result.
4863static __inline __m256d __DEFAULT_FN_ATTRS
4864_mm256_set_m128d (__m128d __hi, __m128d __lo)
4865{
4866 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4867}
4868
4869/// Constructs a 256-bit integer vector by concatenating two 128-bit
4870/// integer vectors.
4871///
4872/// \headerfile <x86intrin.h>
4873///
4874/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4875///
4876/// \param __hi
4877/// A 128-bit integer vector to be copied to the upper 128 bits of the
4878/// result.
4879/// \param __lo
4880/// A 128-bit integer vector to be copied to the lower 128 bits of the
4881/// result.
4882/// \returns A 256-bit integer vector containing the concatenated result.
4883static __inline __m256i __DEFAULT_FN_ATTRS
4884_mm256_set_m128i (__m128i __hi, __m128i __lo)
4885{
4886 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4887}
4888
4889/// Constructs a 256-bit floating-point vector of [8 x float] by
4890/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4891/// similar to _mm256_set_m128, but the order of the input parameters is
4892/// swapped.
4893///
4894/// \headerfile <x86intrin.h>
4895///
4896/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4897///
4898/// \param __lo
4899/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4900/// 128 bits of the result.
4901/// \param __hi
4902/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4903/// 128 bits of the result.
4904/// \returns A 256-bit floating-point vector of [8 x float] containing the
4905/// concatenated result.
4906static __inline __m256 __DEFAULT_FN_ATTRS
4907_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4908{
4909 return _mm256_set_m128(__hi, __lo);
4910}
4911
4912/// Constructs a 256-bit floating-point vector of [4 x double] by
4913/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4914/// similar to _mm256_set_m128d, but the order of the input parameters is
4915/// swapped.
4916///
4917/// \headerfile <x86intrin.h>
4918///
4919/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4920///
4921/// \param __lo
4922/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4923/// 128 bits of the result.
4924/// \param __hi
4925/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4926/// 128 bits of the result.
4927/// \returns A 256-bit floating-point vector of [4 x double] containing the
4928/// concatenated result.
4929static __inline __m256d __DEFAULT_FN_ATTRS
4930_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4931{
4932 return (__m256d)_mm256_set_m128d(__hi, __lo);
4933}
4934
4935/// Constructs a 256-bit integer vector by concatenating two 128-bit
4936/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4937/// the input parameters is swapped.
4938///
4939/// \headerfile <x86intrin.h>
4940///
4941/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4942///
4943/// \param __lo
4944/// A 128-bit integer vector to be copied to the lower 128 bits of the
4945/// result.
4946/// \param __hi
4947/// A 128-bit integer vector to be copied to the upper 128 bits of the
4948/// result.
4949/// \returns A 256-bit integer vector containing the concatenated result.
4950static __inline __m256i __DEFAULT_FN_ATTRS
4951_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4952{
4953 return (__m256i)_mm256_set_m128i(__hi, __lo);
4954}
4955
4956/* SIMD load ops (unaligned) */
4957/// Loads two 128-bit floating-point vectors of [4 x float] from
4958/// unaligned memory locations and constructs a 256-bit floating-point vector
4959/// of [8 x float] by concatenating the two 128-bit vectors.
4960///
4961/// \headerfile <x86intrin.h>
4962///
4963/// This intrinsic corresponds to load instructions followed by the
4964/// <c> VINSERTF128 </c> instruction.
4965///
4966/// \param __addr_hi
4967/// A pointer to a 128-bit memory location containing 4 consecutive
4968/// single-precision floating-point values. These values are to be copied to
4969/// bits[255:128] of the result. The address of the memory location does not
4970/// have to be aligned.
4971/// \param __addr_lo
4972/// A pointer to a 128-bit memory location containing 4 consecutive
4973/// single-precision floating-point values. These values are to be copied to
4974/// bits[127:0] of the result. The address of the memory location does not
4975/// have to be aligned.
4976/// \returns A 256-bit floating-point vector of [8 x float] containing the
4977/// concatenated result.
4978static __inline __m256 __DEFAULT_FN_ATTRS
4979_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4980{
4981 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4982}
4983
4984/// Loads two 128-bit floating-point vectors of [2 x double] from
4985/// unaligned memory locations and constructs a 256-bit floating-point vector
4986/// of [4 x double] by concatenating the two 128-bit vectors.
4987///
4988/// \headerfile <x86intrin.h>
4989///
4990/// This intrinsic corresponds to load instructions followed by the
4991/// <c> VINSERTF128 </c> instruction.
4992///
4993/// \param __addr_hi
4994/// A pointer to a 128-bit memory location containing two consecutive
4995/// double-precision floating-point values. These values are to be copied to
4996/// bits[255:128] of the result. The address of the memory location does not
4997/// have to be aligned.
4998/// \param __addr_lo
4999/// A pointer to a 128-bit memory location containing two consecutive
5000/// double-precision floating-point values. These values are to be copied to
5001/// bits[127:0] of the result. The address of the memory location does not
5002/// have to be aligned.
5003/// \returns A 256-bit floating-point vector of [4 x double] containing the
5004/// concatenated result.
5005static __inline __m256d __DEFAULT_FN_ATTRS
5006_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
5007{
5008 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
5009}
5010
5011/// Loads two 128-bit integer vectors from unaligned memory locations and
5012/// constructs a 256-bit integer vector by concatenating the two 128-bit
5013/// vectors.
5014///
5015/// \headerfile <x86intrin.h>
5016///
5017/// This intrinsic corresponds to load instructions followed by the
5018/// <c> VINSERTF128 </c> instruction.
5019///
5020/// \param __addr_hi
5021/// A pointer to a 128-bit memory location containing a 128-bit integer
5022/// vector. This vector is to be copied to bits[255:128] of the result. The
5023/// address of the memory location does not have to be aligned.
5024/// \param __addr_lo
5025/// A pointer to a 128-bit memory location containing a 128-bit integer
5026/// vector. This vector is to be copied to bits[127:0] of the result. The
5027/// address of the memory location does not have to be aligned.
5028/// \returns A 256-bit integer vector containing the concatenated result.
5029static __inline __m256i __DEFAULT_FN_ATTRS
5030_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
5031{
5032 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5033}
5034
5035/* SIMD store ops (unaligned) */
5036/// Stores the upper and lower 128 bits of a 256-bit floating-point
5037/// vector of [8 x float] into two different unaligned memory locations.
5038///
5039/// \headerfile <x86intrin.h>
5040///
5041/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5042/// store instructions.
5043///
5044/// \param __addr_hi
5045/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5046/// copied to this memory location. The address of this memory location does
5047/// not have to be aligned.
5048/// \param __addr_lo
5049/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5050/// copied to this memory location. The address of this memory location does
5051/// not have to be aligned.
5052/// \param __a
5053/// A 256-bit floating-point vector of [8 x float].
5054static __inline void __DEFAULT_FN_ATTRS
5055_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5056{
5057 __m128 __v128;
5058
5059 __v128 = _mm256_castps256_ps128(__a);
5060 _mm_storeu_ps(__addr_lo, __v128);
5061 __v128 = _mm256_extractf128_ps(__a, 1);
5062 _mm_storeu_ps(__addr_hi, __v128);
5063}
5064
5065/// Stores the upper and lower 128 bits of a 256-bit floating-point
5066/// vector of [4 x double] into two different unaligned memory locations.
5067///
5068/// \headerfile <x86intrin.h>
5069///
5070/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5071/// store instructions.
5072///
5073/// \param __addr_hi
5074/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5075/// copied to this memory location. The address of this memory location does
5076/// not have to be aligned.
5077/// \param __addr_lo
5078/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5079/// copied to this memory location. The address of this memory location does
5080/// not have to be aligned.
5081/// \param __a
5082/// A 256-bit floating-point vector of [4 x double].
5083static __inline void __DEFAULT_FN_ATTRS
5084_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5085{
5086 __m128d __v128;
5087
5088 __v128 = _mm256_castpd256_pd128(__a);
5089 _mm_storeu_pd(__addr_lo, __v128);
5090 __v128 = _mm256_extractf128_pd(__a, 1);
5091 _mm_storeu_pd(__addr_hi, __v128);
5092}
5093
5094/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5095/// two different unaligned memory locations.
5096///
5097/// \headerfile <x86intrin.h>
5098///
5099/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5100/// store instructions.
5101///
5102/// \param __addr_hi
5103/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5104/// copied to this memory location. The address of this memory location does
5105/// not have to be aligned.
5106/// \param __addr_lo
5107/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5108/// copied to this memory location. The address of this memory location does
5109/// not have to be aligned.
5110/// \param __a
5111/// A 256-bit integer vector.
5112static __inline void __DEFAULT_FN_ATTRS
5113_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5114{
5115 __m128i __v128;
5116
5117 __v128 = _mm256_castsi256_si128(__a);
5118 _mm_storeu_si128(__addr_lo, __v128);
5119 __v128 = _mm256_extractf128_si256(__a, 1);
5120 _mm_storeu_si128(__addr_hi, __v128);
5121}
5122
5123#undef __DEFAULT_FN_ATTRS
5124#undef __DEFAULT_FN_ATTRS128
5125
5126#endif /* __AVXINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3058
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition: avxintrin.h:4843
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition: avxintrin.h:3102
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:744
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition: avxintrin.h:3308
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2930
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition: avxintrin.h:92
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition: avxintrin.h:3604
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition: avxintrin.h:4439
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition: avxintrin.h:3122
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: avxintrin.h:4164
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition: avxintrin.h:2284
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition: avxintrin.h:3272
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4626
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition: avxintrin.h:3328
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition: avxintrin.h:2471
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition: avxintrin.h:4979
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:356
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3418
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition: avxintrin.h:4235
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition: avxintrin.h:4422
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:656
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition: avxintrin.h:2192
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:390
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition: avxintrin.h:2404
static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-p...
Definition: avxintrin.h:4473
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition: avxintrin.h:2976
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition: avxintrin.h:2208
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zer...
Definition: avxintrin.h:4340
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition: avxintrin.h:3653
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2878
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
Definition: avxintrin.h:4490
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3443
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3394
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set...
Definition: avxintrin.h:4290
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition: avxintrin.h:982
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition: avxintrin.h:2243
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition: avxintrin.h:3366
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition: avxintrin.h:4776
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition: avxintrin.h:4824
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition: avxintrin.h:3215
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2957
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition: avxintrin.h:1406
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition: avxintrin.h:4864
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition: avxintrin.h:4371
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition: avxintrin.h:4216
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition: avxintrin.h:3584
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition: avxintrin.h:891
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition: avxintrin.h:3640
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:373
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: avxintrin.h:4079
#define __DEFAULT_FN_ATTRS
Definition: avxintrin.h:53
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition: avxintrin.h:3516
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4527
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition: avxintrin.h:2264
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition: avxintrin.h:3666
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition: avxintrin.h:3999
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition: avxintrin.h:2227
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements se...
Definition: avxintrin.h:4311
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4456
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2614
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition: avxintrin.h:286
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition: avxintrin.h:3693
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2848
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition: avxintrin.h:3158
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:674
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition: avxintrin.h:4800
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers re...
Definition: avxintrin.h:2304
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition: avxintrin.h:4549
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4951
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition: avxintrin.h:128
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition: avxintrin.h:3930
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3080
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition: avxintrin.h:244
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2790
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and constructs a 256-bit floating-point vector of [4 x double] from them.
Definition: avxintrin.h:5006
static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition: avxintrin.h:2353
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition: avxintrin.h:2379
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition: avxintrin.h:184
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2702
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two different unaligned memory locations.
Definition: avxintrin.h:5084
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition: avxintrin.h:339
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:147
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:721
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:166
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2731
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition: avxintrin.h:304
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:560
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition: avxintrin.h:3540
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition: avxintrin.h:3492
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:638
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p into a vector of [4 x double].
Definition: avxintrin.h:3175
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2904
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2 x double].
Definition: avxintrin.h:4590
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral values.
Definition: avxintrin.h:4197
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4405
static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition: avxintrin.h:2320
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:698
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:767
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2760
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition: avxintrin.h:2177
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-point vectors of [2 x double].
Definition: avxintrin.h:4930
#define __DEFAULT_FN_ATTRS128
Definition: avxintrin.h:56
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition: avxintrin.h:581
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition: avxintrin.h:4031
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition: avxintrin.h:1434
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition: avxintrin.h:2449
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory locations.
Definition: avxintrin.h:5113
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition: avxintrin.h:110
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4884
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:620
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2555
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition: avxintrin.h:4326
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition: avxintrin.h:2426
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition: avxintrin.h:3812
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition: avxintrin.h:5030
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition: avxintrin.h:797
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition: avxintrin.h:4353
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition: avxintrin.h:4388
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3467
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition: avxintrin.h:202
static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition: avxintrin.h:2336
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition: avxintrin.h:3625
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point vectors of [4 x float].
Definition: avxintrin.h:4907
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:3231
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3349
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition: avxintrin.h:265
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition: avxintrin.h:4506
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition: avxintrin.h:3764
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4570
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2498
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition: avxintrin.h:3895
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p in...
Definition: avxintrin.h:3195
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-precision floating-point values.
Definition: avxintrin.h:3732
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3036
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2672
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition: avxintrin.h:2994
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition: avxintrin.h:322
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2525
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition: avxintrin.h:836
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition: avxintrin.h:223
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2643
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition: avxintrin.h:4272
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:3252
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:542
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3290
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two different unaligned memory locations.
Definition: avxintrin.h:5055
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition: avxintrin.h:4254
double __v4df __attribute__((__vector_size__(32)))
Definition: avxintrin.h:17
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2584
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2819
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 x float].
Definition: avxintrin.h:4608
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition: avxintrin.h:74
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition: avxintrin.h:3564
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition: avxintrin.h:3142
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition: avxintrin.h:602
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the specified double-precision floating-point values.
Definition: avxintrin.h:3959
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1612
static __inline__ void int __a
Definition: emmintrin.h:4058
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3441
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1860
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1973
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3859
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3890
struct __storeu_i16 *__P __v
Definition: immintrin.h:472
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:2033
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:2113
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1870