clang 19.0.0git
avxintrin.h
Go to the documentation of this file.
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* Internal 256-bit element-typed vectors, used for casts inside this header. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 256-bit vector types with 32-byte alignment. */
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* The same vector types with the alignment requirement lowered to 1 byte. */
typedef float __m256_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
41
42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Define the default attributes for the functions in this file.
 * __DEFAULT_FN_ATTRS is applied to the 256-bit intrinsics and
 * __DEFAULT_FN_ATTRS128 to the 128-bit ones; they differ only in the
 * __min_vector_width__ value. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(128)))
59
60/* Arithmetic */
61/// Adds two 256-bit vectors of [4 x double].
62///
63/// \headerfile <x86intrin.h>
64///
65/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
66///
67/// \param __a
68/// A 256-bit vector of [4 x double] containing one of the source operands.
69/// \param __b
70/// A 256-bit vector of [4 x double] containing one of the source operands.
71/// \returns A 256-bit vector of [4 x double] containing the sums of both
72/// operands.
73static __inline __m256d __DEFAULT_FN_ATTRS
74_mm256_add_pd(__m256d __a, __m256d __b)
75{
76 return (__m256d)((__v4df)__a+(__v4df)__b);
77}
78
79/// Adds two 256-bit vectors of [8 x float].
80///
81/// \headerfile <x86intrin.h>
82///
83/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
84///
85/// \param __a
86/// A 256-bit vector of [8 x float] containing one of the source operands.
87/// \param __b
88/// A 256-bit vector of [8 x float] containing one of the source operands.
89/// \returns A 256-bit vector of [8 x float] containing the sums of both
90/// operands.
91static __inline __m256 __DEFAULT_FN_ATTRS
92_mm256_add_ps(__m256 __a, __m256 __b)
93{
94 return (__m256)((__v8sf)__a+(__v8sf)__b);
95}
96
97/// Subtracts two 256-bit vectors of [4 x double].
98///
99/// \headerfile <x86intrin.h>
100///
101/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
102///
103/// \param __a
104/// A 256-bit vector of [4 x double] containing the minuend.
105/// \param __b
106/// A 256-bit vector of [4 x double] containing the subtrahend.
107/// \returns A 256-bit vector of [4 x double] containing the differences between
108/// both operands.
109static __inline __m256d __DEFAULT_FN_ATTRS
110_mm256_sub_pd(__m256d __a, __m256d __b)
111{
112 return (__m256d)((__v4df)__a-(__v4df)__b);
113}
114
115/// Subtracts two 256-bit vectors of [8 x float].
116///
117/// \headerfile <x86intrin.h>
118///
119/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
120///
121/// \param __a
122/// A 256-bit vector of [8 x float] containing the minuend.
123/// \param __b
124/// A 256-bit vector of [8 x float] containing the subtrahend.
125/// \returns A 256-bit vector of [8 x float] containing the differences between
126/// both operands.
127static __inline __m256 __DEFAULT_FN_ATTRS
128_mm256_sub_ps(__m256 __a, __m256 __b)
129{
130 return (__m256)((__v8sf)__a-(__v8sf)__b);
131}
132
133/// Adds the even-indexed values and subtracts the odd-indexed values of
134/// two 256-bit vectors of [4 x double].
135///
136/// \headerfile <x86intrin.h>
137///
138/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
139///
140/// \param __a
141/// A 256-bit vector of [4 x double] containing the left source operand.
142/// \param __b
143/// A 256-bit vector of [4 x double] containing the right source operand.
144/// \returns A 256-bit vector of [4 x double] containing the alternating sums
145/// and differences between both operands.
146static __inline __m256d __DEFAULT_FN_ATTRS
147_mm256_addsub_pd(__m256d __a, __m256d __b)
148{
149 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
150}
151
152/// Adds the even-indexed values and subtracts the odd-indexed values of
153/// two 256-bit vectors of [8 x float].
154///
155/// \headerfile <x86intrin.h>
156///
157/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
158///
159/// \param __a
160/// A 256-bit vector of [8 x float] containing the left source operand.
161/// \param __b
162/// A 256-bit vector of [8 x float] containing the right source operand.
163/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
164/// differences between both operands.
165static __inline __m256 __DEFAULT_FN_ATTRS
166_mm256_addsub_ps(__m256 __a, __m256 __b)
167{
168 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
169}
170
171/// Divides two 256-bit vectors of [4 x double].
172///
173/// \headerfile <x86intrin.h>
174///
175/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
176///
177/// \param __a
178/// A 256-bit vector of [4 x double] containing the dividend.
179/// \param __b
180/// A 256-bit vector of [4 x double] containing the divisor.
181/// \returns A 256-bit vector of [4 x double] containing the quotients of both
182/// operands.
183static __inline __m256d __DEFAULT_FN_ATTRS
184_mm256_div_pd(__m256d __a, __m256d __b)
185{
186 return (__m256d)((__v4df)__a/(__v4df)__b);
187}
188
189/// Divides two 256-bit vectors of [8 x float].
190///
191/// \headerfile <x86intrin.h>
192///
193/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
194///
195/// \param __a
196/// A 256-bit vector of [8 x float] containing the dividend.
197/// \param __b
198/// A 256-bit vector of [8 x float] containing the divisor.
199/// \returns A 256-bit vector of [8 x float] containing the quotients of both
200/// operands.
201static __inline __m256 __DEFAULT_FN_ATTRS
202_mm256_div_ps(__m256 __a, __m256 __b)
203{
204 return (__m256)((__v8sf)__a/(__v8sf)__b);
205}
206
207/// Compares two 256-bit vectors of [4 x double] and returns the greater
208/// of each pair of values.
209///
210/// If either value in a comparison is NaN, returns the value from \a __b.
211///
212/// \headerfile <x86intrin.h>
213///
214/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
215///
216/// \param __a
217/// A 256-bit vector of [4 x double] containing one of the operands.
218/// \param __b
219/// A 256-bit vector of [4 x double] containing one of the operands.
220/// \returns A 256-bit vector of [4 x double] containing the maximum values
221/// between both operands.
222static __inline __m256d __DEFAULT_FN_ATTRS
223_mm256_max_pd(__m256d __a, __m256d __b)
224{
225 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
226}
227
228/// Compares two 256-bit vectors of [8 x float] and returns the greater
229/// of each pair of values.
230///
231/// If either value in a comparison is NaN, returns the value from \a __b.
232///
233/// \headerfile <x86intrin.h>
234///
235/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
236///
237/// \param __a
238/// A 256-bit vector of [8 x float] containing one of the operands.
239/// \param __b
240/// A 256-bit vector of [8 x float] containing one of the operands.
241/// \returns A 256-bit vector of [8 x float] containing the maximum values
242/// between both operands.
243static __inline __m256 __DEFAULT_FN_ATTRS
244_mm256_max_ps(__m256 __a, __m256 __b)
245{
246 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
247}
248
249/// Compares two 256-bit vectors of [4 x double] and returns the lesser
250/// of each pair of values.
251///
252/// If either value in a comparison is NaN, returns the value from \a __b.
253///
254/// \headerfile <x86intrin.h>
255///
256/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
257///
258/// \param __a
259/// A 256-bit vector of [4 x double] containing one of the operands.
260/// \param __b
261/// A 256-bit vector of [4 x double] containing one of the operands.
262/// \returns A 256-bit vector of [4 x double] containing the minimum values
263/// between both operands.
264static __inline __m256d __DEFAULT_FN_ATTRS
265_mm256_min_pd(__m256d __a, __m256d __b)
266{
267 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
268}
269
270/// Compares two 256-bit vectors of [8 x float] and returns the lesser
271/// of each pair of values.
272///
273/// If either value in a comparison is NaN, returns the value from \a __b.
274///
275/// \headerfile <x86intrin.h>
276///
277/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
278///
279/// \param __a
280/// A 256-bit vector of [8 x float] containing one of the operands.
281/// \param __b
282/// A 256-bit vector of [8 x float] containing one of the operands.
283/// \returns A 256-bit vector of [8 x float] containing the minimum values
284/// between both operands.
285static __inline __m256 __DEFAULT_FN_ATTRS
286_mm256_min_ps(__m256 __a, __m256 __b)
287{
288 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
289}
290
291/// Multiplies two 256-bit vectors of [4 x double].
292///
293/// \headerfile <x86intrin.h>
294///
295/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
296///
297/// \param __a
298/// A 256-bit vector of [4 x double] containing one of the operands.
299/// \param __b
300/// A 256-bit vector of [4 x double] containing one of the operands.
301/// \returns A 256-bit vector of [4 x double] containing the products of both
302/// operands.
303static __inline __m256d __DEFAULT_FN_ATTRS
304_mm256_mul_pd(__m256d __a, __m256d __b)
305{
306 return (__m256d)((__v4df)__a * (__v4df)__b);
307}
308
309/// Multiplies two 256-bit vectors of [8 x float].
310///
311/// \headerfile <x86intrin.h>
312///
313/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
314///
315/// \param __a
316/// A 256-bit vector of [8 x float] containing one of the operands.
317/// \param __b
318/// A 256-bit vector of [8 x float] containing one of the operands.
319/// \returns A 256-bit vector of [8 x float] containing the products of both
320/// operands.
321static __inline __m256 __DEFAULT_FN_ATTRS
322_mm256_mul_ps(__m256 __a, __m256 __b)
323{
324 return (__m256)((__v8sf)__a * (__v8sf)__b);
325}
326
327/// Calculates the square roots of the values in a 256-bit vector of
328/// [4 x double].
329///
330/// \headerfile <x86intrin.h>
331///
332/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
333///
334/// \param __a
335/// A 256-bit vector of [4 x double].
336/// \returns A 256-bit vector of [4 x double] containing the square roots of the
337/// values in the operand.
338static __inline __m256d __DEFAULT_FN_ATTRS
340{
341 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
342}
343
344/// Calculates the square roots of the values in a 256-bit vector of
345/// [8 x float].
346///
347/// \headerfile <x86intrin.h>
348///
349/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
350///
351/// \param __a
352/// A 256-bit vector of [8 x float].
353/// \returns A 256-bit vector of [8 x float] containing the square roots of the
354/// values in the operand.
355static __inline __m256 __DEFAULT_FN_ATTRS
357{
358 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
359}
360
361/// Calculates the reciprocal square roots of the values in a 256-bit
362/// vector of [8 x float].
363///
364/// \headerfile <x86intrin.h>
365///
366/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
367///
368/// \param __a
369/// A 256-bit vector of [8 x float].
370/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
371/// roots of the values in the operand.
372static __inline __m256 __DEFAULT_FN_ATTRS
374{
375 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
376}
377
378/// Calculates the reciprocals of the values in a 256-bit vector of
379/// [8 x float].
380///
381/// \headerfile <x86intrin.h>
382///
383/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
384///
385/// \param __a
386/// A 256-bit vector of [8 x float].
387/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
388/// values in the operand.
389static __inline __m256 __DEFAULT_FN_ATTRS
391{
392 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
393}
394
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
426
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
458
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
475
/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
493
/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
510
/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
527
528/* Logical */
529/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
530///
531/// \headerfile <x86intrin.h>
532///
533/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
534///
535/// \param __a
536/// A 256-bit vector of [4 x double] containing one of the source operands.
537/// \param __b
538/// A 256-bit vector of [4 x double] containing one of the source operands.
539/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
540/// values between both operands.
541static __inline __m256d __DEFAULT_FN_ATTRS
542_mm256_and_pd(__m256d __a, __m256d __b)
543{
544 return (__m256d)((__v4du)__a & (__v4du)__b);
545}
546
547/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
548///
549/// \headerfile <x86intrin.h>
550///
551/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
552///
553/// \param __a
554/// A 256-bit vector of [8 x float] containing one of the source operands.
555/// \param __b
556/// A 256-bit vector of [8 x float] containing one of the source operands.
557/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
558/// values between both operands.
559static __inline __m256 __DEFAULT_FN_ATTRS
560_mm256_and_ps(__m256 __a, __m256 __b)
561{
562 return (__m256)((__v8su)__a & (__v8su)__b);
563}
564
565/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
566/// the one's complement of the values contained in the first source operand.
567///
568/// \headerfile <x86intrin.h>
569///
570/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
571///
572/// \param __a
573/// A 256-bit vector of [4 x double] containing the left source operand. The
574/// one's complement of this value is used in the bitwise AND.
575/// \param __b
576/// A 256-bit vector of [4 x double] containing the right source operand.
577/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
578/// values of the second operand and the one's complement of the first
579/// operand.
580static __inline __m256d __DEFAULT_FN_ATTRS
581_mm256_andnot_pd(__m256d __a, __m256d __b)
582{
583 return (__m256d)(~(__v4du)__a & (__v4du)__b);
584}
585
586/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
587/// the one's complement of the values contained in the first source operand.
588///
589/// \headerfile <x86intrin.h>
590///
591/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
592///
593/// \param __a
594/// A 256-bit vector of [8 x float] containing the left source operand. The
595/// one's complement of this value is used in the bitwise AND.
596/// \param __b
597/// A 256-bit vector of [8 x float] containing the right source operand.
598/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
599/// values of the second operand and the one's complement of the first
600/// operand.
601static __inline __m256 __DEFAULT_FN_ATTRS
602_mm256_andnot_ps(__m256 __a, __m256 __b)
603{
604 return (__m256)(~(__v8su)__a & (__v8su)__b);
605}
606
607/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
608///
609/// \headerfile <x86intrin.h>
610///
611/// This intrinsic corresponds to the <c> VORPD </c> instruction.
612///
613/// \param __a
614/// A 256-bit vector of [4 x double] containing one of the source operands.
615/// \param __b
616/// A 256-bit vector of [4 x double] containing one of the source operands.
617/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
618/// values between both operands.
619static __inline __m256d __DEFAULT_FN_ATTRS
620_mm256_or_pd(__m256d __a, __m256d __b)
621{
622 return (__m256d)((__v4du)__a | (__v4du)__b);
623}
624
625/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
626///
627/// \headerfile <x86intrin.h>
628///
629/// This intrinsic corresponds to the <c> VORPS </c> instruction.
630///
631/// \param __a
632/// A 256-bit vector of [8 x float] containing one of the source operands.
633/// \param __b
634/// A 256-bit vector of [8 x float] containing one of the source operands.
635/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
636/// values between both operands.
637static __inline __m256 __DEFAULT_FN_ATTRS
638_mm256_or_ps(__m256 __a, __m256 __b)
639{
640 return (__m256)((__v8su)__a | (__v8su)__b);
641}
642
643/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
644///
645/// \headerfile <x86intrin.h>
646///
647/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
648///
649/// \param __a
650/// A 256-bit vector of [4 x double] containing one of the source operands.
651/// \param __b
652/// A 256-bit vector of [4 x double] containing one of the source operands.
653/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
654/// values between both operands.
655static __inline __m256d __DEFAULT_FN_ATTRS
656_mm256_xor_pd(__m256d __a, __m256d __b)
657{
658 return (__m256d)((__v4du)__a ^ (__v4du)__b);
659}
660
661/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
662///
663/// \headerfile <x86intrin.h>
664///
665/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
666///
667/// \param __a
668/// A 256-bit vector of [8 x float] containing one of the source operands.
669/// \param __b
670/// A 256-bit vector of [8 x float] containing one of the source operands.
671/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
672/// values between both operands.
673static __inline __m256 __DEFAULT_FN_ATTRS
674_mm256_xor_ps(__m256 __a, __m256 __b)
675{
676 return (__m256)((__v8su)__a ^ (__v8su)__b);
677}
678
679/* Horizontal arithmetic */
680/// Horizontally adds the adjacent pairs of values contained in two
681/// 256-bit vectors of [4 x double].
682///
683/// \headerfile <x86intrin.h>
684///
685/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
686///
687/// \param __a
688/// A 256-bit vector of [4 x double] containing one of the source operands.
689/// The horizontal sums of the values are returned in the even-indexed
690/// elements of a vector of [4 x double].
691/// \param __b
692/// A 256-bit vector of [4 x double] containing one of the source operands.
693/// The horizontal sums of the values are returned in the odd-indexed
694/// elements of a vector of [4 x double].
695/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
696/// both operands.
697static __inline __m256d __DEFAULT_FN_ATTRS
698_mm256_hadd_pd(__m256d __a, __m256d __b)
699{
700 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
701}
702
703/// Horizontally adds the adjacent pairs of values contained in two
704/// 256-bit vectors of [8 x float].
705///
706/// \headerfile <x86intrin.h>
707///
708/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
709///
710/// \param __a
711/// A 256-bit vector of [8 x float] containing one of the source operands.
712/// The horizontal sums of the values are returned in the elements with
713/// index 0, 1, 4, 5 of a vector of [8 x float].
714/// \param __b
715/// A 256-bit vector of [8 x float] containing one of the source operands.
716/// The horizontal sums of the values are returned in the elements with
717/// index 2, 3, 6, 7 of a vector of [8 x float].
718/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
719/// both operands.
720static __inline __m256 __DEFAULT_FN_ATTRS
721_mm256_hadd_ps(__m256 __a, __m256 __b)
722{
723 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
724}
725
726/// Horizontally subtracts the adjacent pairs of values contained in two
727/// 256-bit vectors of [4 x double].
728///
729/// \headerfile <x86intrin.h>
730///
731/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
732///
733/// \param __a
734/// A 256-bit vector of [4 x double] containing one of the source operands.
735/// The horizontal differences between the values are returned in the
736/// even-indexed elements of a vector of [4 x double].
737/// \param __b
738/// A 256-bit vector of [4 x double] containing one of the source operands.
739/// The horizontal differences between the values are returned in the
740/// odd-indexed elements of a vector of [4 x double].
741/// \returns A 256-bit vector of [4 x double] containing the horizontal
742/// differences of both operands.
743static __inline __m256d __DEFAULT_FN_ATTRS
744_mm256_hsub_pd(__m256d __a, __m256d __b)
745{
746 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
747}
748
749/// Horizontally subtracts the adjacent pairs of values contained in two
750/// 256-bit vectors of [8 x float].
751///
752/// \headerfile <x86intrin.h>
753///
754/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
755///
756/// \param __a
757/// A 256-bit vector of [8 x float] containing one of the source operands.
758/// The horizontal differences between the values are returned in the
759/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
760/// \param __b
761/// A 256-bit vector of [8 x float] containing one of the source operands.
762/// The horizontal differences between the values are returned in the
763/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
764/// \returns A 256-bit vector of [8 x float] containing the horizontal
765/// differences of both operands.
766static __inline __m256 __DEFAULT_FN_ATTRS
767_mm256_hsub_ps(__m256 __a, __m256 __b)
768{
769 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
770}
771
772/* Vector permutations */
773/// Copies the values in a 128-bit vector of [2 x double] as specified
774/// by the 128-bit integer vector operand.
775///
776/// \headerfile <x86intrin.h>
777///
778/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
779///
780/// \param __a
781/// A 128-bit vector of [2 x double].
782/// \param __c
783/// A 128-bit integer vector operand specifying how the values are to be
784/// copied. \n
785/// Bit [1]: \n
786/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
787/// vector. \n
788/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
789/// returned vector. \n
790/// Bit [65]: \n
791/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
792/// returned vector. \n
793/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
794/// returned vector.
795/// \returns A 128-bit vector of [2 x double] containing the copied values.
796static __inline __m128d __DEFAULT_FN_ATTRS128
797_mm_permutevar_pd(__m128d __a, __m128i __c)
798{
799 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
800}
801
802/// Copies the values in a 256-bit vector of [4 x double] as specified
803/// by the 256-bit integer vector operand.
804///
805/// \headerfile <x86intrin.h>
806///
807/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
808///
809/// \param __a
810/// A 256-bit vector of [4 x double].
811/// \param __c
812/// A 256-bit integer vector operand specifying how the values are to be
813/// copied. \n
814/// Bit [1]: \n
815/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
816/// vector. \n
817/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
818/// returned vector. \n
819/// Bit [65]: \n
820/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
821/// returned vector. \n
822/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
823/// returned vector. \n
824/// Bit [129]: \n
825/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
826/// returned vector. \n
827/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
828/// returned vector. \n
829/// Bit [193]: \n
830/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
831/// returned vector. \n
832/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
833/// returned vector.
834/// \returns A 256-bit vector of [4 x double] containing the copied values.
835static __inline __m256d __DEFAULT_FN_ATTRS
836_mm256_permutevar_pd(__m256d __a, __m256i __c)
837{
838 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
839}
840
841/// Copies the values stored in a 128-bit vector of [4 x float] as
842/// specified by the 128-bit integer vector operand.
843/// \headerfile <x86intrin.h>
844///
845/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
846///
847/// \param __a
848/// A 128-bit vector of [4 x float].
849/// \param __c
850/// A 128-bit integer vector operand specifying how the values are to be
851/// copied. \n
852/// Bits [1:0]: \n
853/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
854/// returned vector. \n
855/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
856/// returned vector. \n
857/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
858/// returned vector. \n
859/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
860/// returned vector. \n
861/// Bits [33:32]: \n
862/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
863/// returned vector. \n
864/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
865/// returned vector. \n
866/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
867/// returned vector. \n
868/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
869/// returned vector. \n
870/// Bits [65:64]: \n
871/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
872/// returned vector. \n
873/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
874/// returned vector. \n
875/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
876/// returned vector. \n
877/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
878/// returned vector. \n
879/// Bits [97:96]: \n
880/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
881/// returned vector. \n
882/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
883/// returned vector. \n
884/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
885/// returned vector. \n
886/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
887/// returned vector.
888/// \returns A 128-bit vector of [4 x float] containing the copied values.
889static __inline __m128 __DEFAULT_FN_ATTRS128
890_mm_permutevar_ps(__m128 __a, __m128i __c)
891{
892 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
893}
894
895/// Copies the values stored in a 256-bit vector of [8 x float] as
896/// specified by the 256-bit integer vector operand.
897///
898/// \headerfile <x86intrin.h>
899///
900/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
901///
902/// \param __a
903/// A 256-bit vector of [8 x float].
904/// \param __c
905/// A 256-bit integer vector operand specifying how the values are to be
906/// copied. \n
907/// Bits [1:0]: \n
908/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
909/// returned vector. \n
910/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
911/// returned vector. \n
912/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
913/// returned vector. \n
914/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
915/// returned vector. \n
916/// Bits [33:32]: \n
917/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
918/// returned vector. \n
919/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
920/// returned vector. \n
921/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
922/// returned vector. \n
923/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
924/// returned vector. \n
925/// Bits [65:64]: \n
926/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
927/// returned vector. \n
928/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
929/// returned vector. \n
930/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
931/// returned vector. \n
932/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
933/// returned vector. \n
934/// Bits [97:96]: \n
935/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
936/// returned vector. \n
937/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
938/// returned vector. \n
939/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
940/// returned vector. \n
941/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
942/// returned vector. \n
943/// Bits [129:128]: \n
944/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
945/// returned vector. \n
946/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
947/// returned vector. \n
948/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
949/// returned vector. \n
950/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
951/// returned vector. \n
952/// Bits [161:160]: \n
953/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
954/// returned vector. \n
955/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
956/// returned vector. \n
957/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
958/// returned vector. \n
959/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
960/// returned vector. \n
961/// Bits [193:192]: \n
962/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
963/// returned vector. \n
964/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
965/// returned vector. \n
966/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
967/// returned vector. \n
968/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
969/// returned vector. \n
970/// Bits [225:224]: \n
971/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
972/// returned vector. \n
973/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
974/// returned vector. \n
975/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
976/// returned vector. \n
977/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
978/// returned vector.
979/// \returns A 256-bit vector of [8 x float] containing the copied values.
980static __inline __m256 __DEFAULT_FN_ATTRS
982{
983 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
984}
985
/// Copies the values in a 128-bit vector of [2 x double] as specified
///    by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [4 x double] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))

/// Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. The same control bits are applied to the lower and to the
///    upper 128-bit half of the vector. \n
///    Bits [1:0] (lower 128-bit half): \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2] (lower 128-bit half): \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4] (lower 128-bit half): \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6] (lower 128-bit half): \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [1:0] (upper 128-bit half): \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2] (upper 128-bit half): \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4] (upper 128-bit half): \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6] (upper 128-bit half): \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (int)(M)))

/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))

/// Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))

1383/// Merges 64-bit double-precision data values stored in either of the
1384/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1385/// operand.
1386///
1387/// \headerfile <x86intrin.h>
1388///
1389/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1390///
1391/// \param __a
1392/// A 256-bit vector of [4 x double].
1393/// \param __b
1394/// A 256-bit vector of [4 x double].
1395/// \param __c
1396/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1397/// how the values are to be copied. The position of the mask bit corresponds
1398/// to the most significant bit of a copied value. When a mask bit is 0, the
1399/// corresponding 64-bit element in operand \a __a is copied to the same
1400/// position in the destination. When a mask bit is 1, the corresponding
1401/// 64-bit element in operand \a __b is copied to the same position in the
1402/// destination.
1403/// \returns A 256-bit vector of [4 x double] containing the copied values.
1404static __inline __m256d __DEFAULT_FN_ATTRS
1405_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1406{
1407 return (__m256d)__builtin_ia32_blendvpd256(
1408 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1409}
1410
1411/// Merges 32-bit single-precision data values stored in either of the
1412/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1413/// operand.
1414///
1415/// \headerfile <x86intrin.h>
1416///
1417/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1418///
1419/// \param __a
1420/// A 256-bit vector of [8 x float].
1421/// \param __b
1422/// A 256-bit vector of [8 x float].
1423/// \param __c
1424/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1425/// and 31 specifying how the values are to be copied. The position of the
1426/// mask bit corresponds to the most significant bit of a copied value. When
1427/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1428/// copied to the same position in the destination. When a mask bit is 1, the
1429/// corresponding 32-bit element in operand \a __b is copied to the same
1430/// position in the destination.
1431/// \returns A 256-bit vector of [8 x float] containing the copied values.
1432static __inline __m256 __DEFAULT_FN_ATTRS
1433_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1434{
1435 return (__m256)__builtin_ia32_blendvps256(
1436 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1437}
1438
/* Vector Dot Product */
/// Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result.
///
///    The immediate integer operand controls which input elements will
///    contribute to the dot product, and where the final results are returned.
///    In general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two and second two products are
///    summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))

/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand.
///
///    The four selected elements in each operand are copied to the destination
///    according to the bits specified in the immediate operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
///    the destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain the
///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///      destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///      destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///      destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///      the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), (int)(mask)))

/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///      destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///      destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///      destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///      destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///      destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///      destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///      destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///      destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))

/* Compare */
/* Comparison predicates 0x08-0x1f, used as the immediate operand of the
 * _mm*_cmp_* intrinsics. The trailing comment on each line gives the
 * predicate's meaning and its ordered/unordered and signaling behavior. */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1609
1610/* Below intrinsic defined in emmintrin.h can be used for AVX */
1611/// Compares each of the corresponding double-precision values of two
1612/// 128-bit vectors of [2 x double], using the operation specified by the
1613/// immediate integer operand.
1614///
1615/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1616/// If either value in a comparison is NaN, comparisons that are ordered
1617/// return false, and comparisons that are unordered return true.
1618///
1619/// \headerfile <x86intrin.h>
1620///
1621/// \code
1622/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1623/// \endcode
1624///
1625/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1626///
1627/// \param a
1628/// A 128-bit vector of [2 x double].
1629/// \param b
1630/// A 128-bit vector of [2 x double].
1631/// \param c
1632/// An immediate integer operand, with bits [4:0] specifying which comparison
1633/// operation to use: \n
1634/// 0x00: Equal (ordered, non-signaling) \n
1635/// 0x01: Less-than (ordered, signaling) \n
1636/// 0x02: Less-than-or-equal (ordered, signaling) \n
1637/// 0x03: Unordered (non-signaling) \n
1638/// 0x04: Not-equal (unordered, non-signaling) \n
1639/// 0x05: Not-less-than (unordered, signaling) \n
1640/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1641/// 0x07: Ordered (non-signaling) \n
1642/// 0x08: Equal (unordered, non-signaling) \n
1643/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1644/// 0x0A: Not-greater-than (unordered, signaling) \n
1645/// 0x0B: False (ordered, non-signaling) \n
1646/// 0x0C: Not-equal (ordered, non-signaling) \n
1647/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1648/// 0x0E: Greater-than (ordered, signaling) \n
1649/// 0x0F: True (unordered, non-signaling) \n
1650/// 0x10: Equal (ordered, signaling) \n
1651/// 0x11: Less-than (ordered, non-signaling) \n
1652/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1653/// 0x13: Unordered (signaling) \n
1654/// 0x14: Not-equal (unordered, signaling) \n
1655/// 0x15: Not-less-than (unordered, non-signaling) \n
1656/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1657/// 0x17: Ordered (signaling) \n
1658/// 0x18: Equal (unordered, signaling) \n
1659/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1660/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1661/// 0x1B: False (ordered, signaling) \n
1662/// 0x1C: Not-equal (ordered, signaling) \n
1663/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1664/// 0x1E: Greater-than (ordered, non-signaling) \n
1665/// 0x1F: True (unordered, signaling)
1666/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1667/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1668
1669/* Below intrinsic defined in xmmintrin.h can be used for AVX */
1670/// Compares each of the corresponding values of two 128-bit vectors of
1671/// [4 x float], using the operation specified by the immediate integer
1672/// operand.
1673///
1674/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1675/// If either value in a comparison is NaN, comparisons that are ordered
1676/// return false, and comparisons that are unordered return true.
1677///
1678/// \headerfile <x86intrin.h>
1679///
1680/// \code
1681/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1682/// \endcode
1683///
1684/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1685///
1686/// \param a
1687/// A 128-bit vector of [4 x float].
1688/// \param b
1689/// A 128-bit vector of [4 x float].
1690/// \param c
1691/// An immediate integer operand, with bits [4:0] specifying which comparison
1692/// operation to use: \n
1693/// 0x00: Equal (ordered, non-signaling) \n
1694/// 0x01: Less-than (ordered, signaling) \n
1695/// 0x02: Less-than-or-equal (ordered, signaling) \n
1696/// 0x03: Unordered (non-signaling) \n
1697/// 0x04: Not-equal (unordered, non-signaling) \n
1698/// 0x05: Not-less-than (unordered, signaling) \n
1699/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1700/// 0x07: Ordered (non-signaling) \n
1701/// 0x08: Equal (unordered, non-signaling) \n
1702/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1703/// 0x0A: Not-greater-than (unordered, signaling) \n
1704/// 0x0B: False (ordered, non-signaling) \n
1705/// 0x0C: Not-equal (ordered, non-signaling) \n
1706/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1707/// 0x0E: Greater-than (ordered, signaling) \n
1708/// 0x0F: True (unordered, non-signaling) \n
1709/// 0x10: Equal (ordered, signaling) \n
1710/// 0x11: Less-than (ordered, non-signaling) \n
1711/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1712/// 0x13: Unordered (signaling) \n
1713/// 0x14: Not-equal (unordered, signaling) \n
1714/// 0x15: Not-less-than (unordered, non-signaling) \n
1715/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1716/// 0x17: Ordered (signaling) \n
1717/// 0x18: Equal (unordered, signaling) \n
1718/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1719/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1720/// 0x1B: False (ordered, signaling) \n
1721/// 0x1C: Not-equal (ordered, signaling) \n
1722/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1723/// 0x1E: Greater-than (ordered, non-signaling) \n
1724/// 0x1F: True (unordered, signaling)
1725/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1726/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1727
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1787
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1847
1848/* Below intrinsic defined in emmintrin.h can be used for AVX */
/// Compares the lower (scalar) double-precision values of two 128-bit
///    vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
///    the lower 64 bits of the result; the upper 64 bits are copied from
///    \a a. If either value in the comparison is NaN, comparisons that are
///    ordered return false, and comparisons that are unordered return true.
1856///
1857/// \headerfile <x86intrin.h>
1858///
1859/// \code
1860/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1861/// \endcode
1862///
1863/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1864///
1865/// \param a
1866/// A 128-bit vector of [2 x double].
1867/// \param b
1868/// A 128-bit vector of [2 x double].
1869/// \param c
1870/// An immediate integer operand, with bits [4:0] specifying which comparison
1871/// operation to use: \n
1872/// 0x00: Equal (ordered, non-signaling) \n
1873/// 0x01: Less-than (ordered, signaling) \n
1874/// 0x02: Less-than-or-equal (ordered, signaling) \n
1875/// 0x03: Unordered (non-signaling) \n
1876/// 0x04: Not-equal (unordered, non-signaling) \n
1877/// 0x05: Not-less-than (unordered, signaling) \n
1878/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1879/// 0x07: Ordered (non-signaling) \n
1880/// 0x08: Equal (unordered, non-signaling) \n
1881/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1882/// 0x0A: Not-greater-than (unordered, signaling) \n
1883/// 0x0B: False (ordered, non-signaling) \n
1884/// 0x0C: Not-equal (ordered, non-signaling) \n
1885/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1886/// 0x0E: Greater-than (ordered, signaling) \n
1887/// 0x0F: True (unordered, non-signaling) \n
1888/// 0x10: Equal (ordered, signaling) \n
1889/// 0x11: Less-than (ordered, non-signaling) \n
1890/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1891/// 0x13: Unordered (signaling) \n
1892/// 0x14: Not-equal (unordered, signaling) \n
1893/// 0x15: Not-less-than (unordered, non-signaling) \n
1894/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1895/// 0x17: Ordered (signaling) \n
1896/// 0x18: Equal (unordered, signaling) \n
1897/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1898/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1899/// 0x1B: False (ordered, signaling) \n
1900/// 0x1C: Not-equal (ordered, signaling) \n
1901/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1902/// 0x1E: Greater-than (ordered, non-signaling) \n
1903/// 0x1F: True (unordered, signaling)
1904/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1905/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1906
1907/* Below intrinsic defined in xmmintrin.h can be used for AVX */
/// Compares the lower (scalar) single-precision values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
///    lower 32 bits of the result; the upper 96 bits are copied from \a a.
///    If either value in the comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
1915///
1916/// \headerfile <x86intrin.h>
1917///
1918/// \code
1919/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1920/// \endcode
1921///
1922/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1923///
1924/// \param a
1925/// A 128-bit vector of [4 x float].
1926/// \param b
1927/// A 128-bit vector of [4 x float].
1928/// \param c
1929/// An immediate integer operand, with bits [4:0] specifying which comparison
1930/// operation to use: \n
1931/// 0x00: Equal (ordered, non-signaling) \n
1932/// 0x01: Less-than (ordered, signaling) \n
1933/// 0x02: Less-than-or-equal (ordered, signaling) \n
1934/// 0x03: Unordered (non-signaling) \n
1935/// 0x04: Not-equal (unordered, non-signaling) \n
1936/// 0x05: Not-less-than (unordered, signaling) \n
1937/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1938/// 0x07: Ordered (non-signaling) \n
1939/// 0x08: Equal (unordered, non-signaling) \n
1940/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1941/// 0x0A: Not-greater-than (unordered, signaling) \n
1942/// 0x0B: False (ordered, non-signaling) \n
1943/// 0x0C: Not-equal (ordered, non-signaling) \n
1944/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1945/// 0x0E: Greater-than (ordered, signaling) \n
1946/// 0x0F: True (unordered, non-signaling) \n
1947/// 0x10: Equal (ordered, signaling) \n
1948/// 0x11: Less-than (ordered, non-signaling) \n
1949/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1950/// 0x13: Unordered (signaling) \n
1951/// 0x14: Not-equal (unordered, signaling) \n
1952/// 0x15: Not-less-than (unordered, non-signaling) \n
1953/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1954/// 0x17: Ordered (signaling) \n
1955/// 0x18: Equal (unordered, signaling) \n
1956/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1957/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1958/// 0x1B: False (ordered, signaling) \n
1959/// 0x1C: Not-equal (ordered, signaling) \n
1960/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1961/// 0x1E: Greater-than (ordered, non-signaling) \n
1962/// 0x1F: True (unordered, signaling)
1963/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1964/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1965
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1987
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
/* The intermediate (unsigned short) cast is what zero-extends the element
 * before it is widened to int. */
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
2010
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
/* The intermediate (unsigned char) cast is what zero-extends the element
 * before it is widened to int. */
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2033
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2057
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2083
2084
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2110
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2136
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2164
2165/* Conversion */
2166/// Converts a vector of [4 x i32] into a vector of [4 x double].
2167///
2168/// \headerfile <x86intrin.h>
2169///
2170/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2171///
2172/// \param __a
2173/// A 128-bit integer vector of [4 x i32].
2174/// \returns A 256-bit vector of [4 x double] containing the converted values.
2175static __inline __m256d __DEFAULT_FN_ATTRS
2177{
2178 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2179}
2180
2181/// Converts a vector of [8 x i32] into a vector of [8 x float].
2182///
2183/// \headerfile <x86intrin.h>
2184///
2185/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2186///
2187/// \param __a
2188/// A 256-bit integer vector.
2189/// \returns A 256-bit vector of [8 x float] containing the converted values.
2190static __inline __m256 __DEFAULT_FN_ATTRS
2192{
2193 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2194}
2195
2196/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2197/// [4 x float].
2198///
2199/// \headerfile <x86intrin.h>
2200///
2201/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2202///
2203/// \param __a
2204/// A 256-bit vector of [4 x double].
2205/// \returns A 128-bit vector of [4 x float] containing the converted values.
2206static __inline __m128 __DEFAULT_FN_ATTRS
2208{
2209 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2210}
2211
2212/// Converts a vector of [8 x float] into a vector of [8 x i32].
2213///
2214/// If a converted value does not fit in a 32-bit integer, raises a
2215/// floating-point invalid exception. If the exception is masked, returns
2216/// the most negative integer.
2217///
2218/// \headerfile <x86intrin.h>
2219///
2220/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2221///
2222/// \param __a
2223/// A 256-bit vector of [8 x float].
2224/// \returns A 256-bit integer vector containing the converted values.
2225static __inline __m256i __DEFAULT_FN_ATTRS
2227{
2228 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2229}
2230
2231/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2232/// x double].
2233///
2234/// \headerfile <x86intrin.h>
2235///
2236/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2237///
2238/// \param __a
2239/// A 128-bit vector of [4 x float].
2240/// \returns A 256-bit vector of [4 x double] containing the converted values.
2241static __inline __m256d __DEFAULT_FN_ATTRS
2243{
2244 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2245}
2246
2247/// Converts a 256-bit vector of [4 x double] into four signed truncated
2248/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2249/// [4 x i32].
2250///
2251/// If a converted value does not fit in a 32-bit integer, raises a
2252/// floating-point invalid exception. If the exception is masked, returns
2253/// the most negative integer.
2254///
2255/// \headerfile <x86intrin.h>
2256///
2257/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2258///
2259/// \param __a
2260/// A 256-bit vector of [4 x double].
2261/// \returns A 128-bit integer vector containing the converted values.
2262static __inline __m128i __DEFAULT_FN_ATTRS
2264{
2265 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2266}
2267
2268/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2269/// [4 x i32].
2270///
2271/// If a converted value does not fit in a 32-bit integer, raises a
2272/// floating-point invalid exception. If the exception is masked, returns
2273/// the most negative integer.
2274///
2275/// \headerfile <x86intrin.h>
2276///
2277/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2278///
2279/// \param __a
2280/// A 256-bit vector of [4 x double].
2281/// \returns A 128-bit integer vector containing the converted values.
2282static __inline __m128i __DEFAULT_FN_ATTRS
2284{
2285 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2286}
2287
2288/// Converts a vector of [8 x float] into eight signed truncated (rounded
2289/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2290///
2291/// If a converted value does not fit in a 32-bit integer, raises a
2292/// floating-point invalid exception. If the exception is masked, returns
2293/// the most negative integer.
2294///
2295/// \headerfile <x86intrin.h>
2296///
2297/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2298///
2299/// \param __a
2300/// A 256-bit vector of [8 x float].
2301/// \returns A 256-bit integer vector containing the converted values.
2302static __inline __m256i __DEFAULT_FN_ATTRS
2304{
2305 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2306}
2307
2308/// Returns the first element of the input vector of [4 x double].
2309///
2310/// \headerfile <x86intrin.h>
2311///
2312/// This intrinsic is a utility function and does not correspond to a specific
2313/// instruction.
2314///
2315/// \param __a
2316/// A 256-bit vector of [4 x double].
2317/// \returns A 64 bit double containing the first element of the input vector.
2318static __inline double __DEFAULT_FN_ATTRS
2320{
2321 return __a[0];
2322}
2323
2324/// Returns the first element of the input vector of [8 x i32].
2325///
2326/// \headerfile <x86intrin.h>
2327///
2328/// This intrinsic is a utility function and does not correspond to a specific
2329/// instruction.
2330///
2331/// \param __a
2332/// A 256-bit vector of [8 x i32].
/// \returns A 32-bit integer containing the first element of the input vector.
2334static __inline int __DEFAULT_FN_ATTRS
2336{
2337 __v8si __b = (__v8si)__a;
2338 return __b[0];
2339}
2340
2341/// Returns the first element of the input vector of [8 x float].
2342///
2343/// \headerfile <x86intrin.h>
2344///
2345/// This intrinsic is a utility function and does not correspond to a specific
2346/// instruction.
2347///
2348/// \param __a
2349/// A 256-bit vector of [8 x float].
/// \returns A 32-bit float containing the first element of the input vector.
2351static __inline float __DEFAULT_FN_ATTRS
2353{
2354 return __a[0];
2355}
2356
2357/* Vector replicate */
2358/// Moves and duplicates odd-indexed values from a 256-bit vector of
2359/// [8 x float] to float values in a 256-bit vector of [8 x float].
2360///
2361/// \headerfile <x86intrin.h>
2362///
2363/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2364///
2365/// \param __a
2366/// A 256-bit vector of [8 x float]. \n
2367/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2368/// the return value. \n
2369/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2370/// the return value. \n
2371/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2372/// return value. \n
2373/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2374/// return value.
2375/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2376/// values.
2377static __inline __m256 __DEFAULT_FN_ATTRS
2379{
2380 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2381}
2382
2383/// Moves and duplicates even-indexed values from a 256-bit vector of
2384/// [8 x float] to float values in a 256-bit vector of [8 x float].
2385///
2386/// \headerfile <x86intrin.h>
2387///
2388/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2389///
2390/// \param __a
2391/// A 256-bit vector of [8 x float]. \n
2392/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2393/// the return value. \n
2394/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2395/// the return value. \n
2396/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2397/// return value. \n
2398/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2399/// return value.
2400/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2401/// values.
2402static __inline __m256 __DEFAULT_FN_ATTRS
2404{
2405 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2406}
2407
2408/// Moves and duplicates double-precision floating point values from a
2409/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2410/// vector of [4 x double].
2411///
2412/// \headerfile <x86intrin.h>
2413///
2414/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2415///
2416/// \param __a
2417/// A 256-bit vector of [4 x double]. \n
2418/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2419/// return value. \n
2420/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2421/// the return value.
2422/// \returns A 256-bit vector of [4 x double] containing the moved and
2423/// duplicated values.
2424static __inline __m256d __DEFAULT_FN_ATTRS
2426{
2427 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2428}
2429
2430/* Unpack and Interleave */
2431/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2432/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2433///
2434/// \headerfile <x86intrin.h>
2435///
2436/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2437///
2438/// \param __a
2439/// A 256-bit floating-point vector of [4 x double]. \n
2440/// Bits [127:64] are written to bits [63:0] of the return value. \n
2441/// Bits [255:192] are written to bits [191:128] of the return value. \n
2442/// \param __b
2443/// A 256-bit floating-point vector of [4 x double]. \n
2444/// Bits [127:64] are written to bits [127:64] of the return value. \n
2445/// Bits [255:192] are written to bits [255:192] of the return value. \n
2446/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2447static __inline __m256d __DEFAULT_FN_ATTRS
2448_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2449{
2450 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2451}
2452
2453/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2454/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2455///
2456/// \headerfile <x86intrin.h>
2457///
2458/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2459///
2460/// \param __a
2461/// A 256-bit floating-point vector of [4 x double]. \n
2462/// Bits [63:0] are written to bits [63:0] of the return value. \n
2463/// Bits [191:128] are written to bits [191:128] of the return value.
2464/// \param __b
2465/// A 256-bit floating-point vector of [4 x double]. \n
2466/// Bits [63:0] are written to bits [127:64] of the return value. \n
2467/// Bits [191:128] are written to bits [255:192] of the return value. \n
2468/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2469static __inline __m256d __DEFAULT_FN_ATTRS
2470_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2471{
2472 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2473}
2474
2475/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2476/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2477/// vector of [8 x float].
2478///
2479/// \headerfile <x86intrin.h>
2480///
2481/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2482///
2483/// \param __a
2484/// A 256-bit vector of [8 x float]. \n
2485/// Bits [95:64] are written to bits [31:0] of the return value. \n
2486/// Bits [127:96] are written to bits [95:64] of the return value. \n
2487/// Bits [223:192] are written to bits [159:128] of the return value. \n
2488/// Bits [255:224] are written to bits [223:192] of the return value.
2489/// \param __b
2490/// A 256-bit vector of [8 x float]. \n
2491/// Bits [95:64] are written to bits [63:32] of the return value. \n
2492/// Bits [127:96] are written to bits [127:96] of the return value. \n
2493/// Bits [223:192] are written to bits [191:160] of the return value. \n
2494/// Bits [255:224] are written to bits [255:224] of the return value.
2495/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2496static __inline __m256 __DEFAULT_FN_ATTRS
2498{
2499 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2500}
2501
2502/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2503/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2504/// vector of [8 x float].
2505///
2506/// \headerfile <x86intrin.h>
2507///
2508/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2509///
2510/// \param __a
2511/// A 256-bit vector of [8 x float]. \n
2512/// Bits [31:0] are written to bits [31:0] of the return value. \n
2513/// Bits [63:32] are written to bits [95:64] of the return value. \n
2514/// Bits [159:128] are written to bits [159:128] of the return value. \n
2515/// Bits [191:160] are written to bits [223:192] of the return value.
2516/// \param __b
2517/// A 256-bit vector of [8 x float]. \n
2518/// Bits [31:0] are written to bits [63:32] of the return value. \n
2519/// Bits [63:32] are written to bits [127:96] of the return value. \n
2520/// Bits [159:128] are written to bits [191:160] of the return value. \n
2521/// Bits [191:160] are written to bits [255:224] of the return value.
2522/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2523static __inline __m256 __DEFAULT_FN_ATTRS
2525{
2526 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2527}
2528
2529/* Bit Test */
2530/// Given two 128-bit floating-point vectors of [2 x double], perform an
2531/// element-by-element comparison of the double-precision element in the
2532/// first source vector and the corresponding element in the second source
2533/// vector.
2534///
2535/// The EFLAGS register is updated as follows: \n
2536/// If there is at least one pair of double-precision elements where the
2537/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2538/// ZF flag is set to 1. \n
2539/// If there is at least one pair of double-precision elements where the
2540/// sign-bit of the first element is 0 and the sign-bit of the second element
2541/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2542/// This intrinsic returns the value of the ZF flag.
2543///
2544/// \headerfile <x86intrin.h>
2545///
2546/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2547///
2548/// \param __a
2549/// A 128-bit vector of [2 x double].
2550/// \param __b
2551/// A 128-bit vector of [2 x double].
2552/// \returns the ZF flag in the EFLAGS register.
2553static __inline int __DEFAULT_FN_ATTRS128
2554_mm_testz_pd(__m128d __a, __m128d __b)
2555{
2556 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2557}
2558
2559/// Given two 128-bit floating-point vectors of [2 x double], perform an
2560/// element-by-element comparison of the double-precision element in the
2561/// first source vector and the corresponding element in the second source
2562/// vector.
2563///
2564/// The EFLAGS register is updated as follows: \n
2565/// If there is at least one pair of double-precision elements where the
2566/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2567/// ZF flag is set to 1. \n
2568/// If there is at least one pair of double-precision elements where the
2569/// sign-bit of the first element is 0 and the sign-bit of the second element
2570/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2571/// This intrinsic returns the value of the CF flag.
2572///
2573/// \headerfile <x86intrin.h>
2574///
2575/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2576///
2577/// \param __a
2578/// A 128-bit vector of [2 x double].
2579/// \param __b
2580/// A 128-bit vector of [2 x double].
2581/// \returns the CF flag in the EFLAGS register.
2582static __inline int __DEFAULT_FN_ATTRS128
2583_mm_testc_pd(__m128d __a, __m128d __b)
2584{
2585 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2586}
2587
2588/// Given two 128-bit floating-point vectors of [2 x double], perform an
2589/// element-by-element comparison of the double-precision element in the
2590/// first source vector and the corresponding element in the second source
2591/// vector.
2592///
2593/// The EFLAGS register is updated as follows: \n
2594/// If there is at least one pair of double-precision elements where the
2595/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2596/// ZF flag is set to 1. \n
2597/// If there is at least one pair of double-precision elements where the
2598/// sign-bit of the first element is 0 and the sign-bit of the second element
2599/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2600/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2601/// otherwise it returns 0.
2602///
2603/// \headerfile <x86intrin.h>
2604///
2605/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2606///
2607/// \param __a
2608/// A 128-bit vector of [2 x double].
2609/// \param __b
2610/// A 128-bit vector of [2 x double].
2611/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2612static __inline int __DEFAULT_FN_ATTRS128
2613_mm_testnzc_pd(__m128d __a, __m128d __b)
2614{
2615 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2616}
2617
2618/// Given two 128-bit floating-point vectors of [4 x float], perform an
2619/// element-by-element comparison of the single-precision element in the
2620/// first source vector and the corresponding element in the second source
2621/// vector.
2622///
2623/// The EFLAGS register is updated as follows: \n
2624/// If there is at least one pair of single-precision elements where the
2625/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2626/// ZF flag is set to 1. \n
2627/// If there is at least one pair of single-precision elements where the
2628/// sign-bit of the first element is 0 and the sign-bit of the second element
2629/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2630/// This intrinsic returns the value of the ZF flag.
2631///
2632/// \headerfile <x86intrin.h>
2633///
2634/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2635///
2636/// \param __a
2637/// A 128-bit vector of [4 x float].
2638/// \param __b
2639/// A 128-bit vector of [4 x float].
2640/// \returns the ZF flag.
2641static __inline int __DEFAULT_FN_ATTRS128
2642_mm_testz_ps(__m128 __a, __m128 __b)
2643{
2644 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2645}
2646
2647/// Given two 128-bit floating-point vectors of [4 x float], perform an
2648/// element-by-element comparison of the single-precision element in the
2649/// first source vector and the corresponding element in the second source
2650/// vector.
2651///
2652/// The EFLAGS register is updated as follows: \n
2653/// If there is at least one pair of single-precision elements where the
2654/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2655/// ZF flag is set to 1. \n
2656/// If there is at least one pair of single-precision elements where the
2657/// sign-bit of the first element is 0 and the sign-bit of the second element
2658/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2659/// This intrinsic returns the value of the CF flag.
2660///
2661/// \headerfile <x86intrin.h>
2662///
2663/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2664///
2665/// \param __a
2666/// A 128-bit vector of [4 x float].
2667/// \param __b
2668/// A 128-bit vector of [4 x float].
2669/// \returns the CF flag.
2670static __inline int __DEFAULT_FN_ATTRS128
2671_mm_testc_ps(__m128 __a, __m128 __b)
2672{
2673 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2674}
2675
2676/// Given two 128-bit floating-point vectors of [4 x float], perform an
2677/// element-by-element comparison of the single-precision element in the
2678/// first source vector and the corresponding element in the second source
2679/// vector.
2680///
2681/// The EFLAGS register is updated as follows: \n
2682/// If there is at least one pair of single-precision elements where the
2683/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2684/// ZF flag is set to 1. \n
2685/// If there is at least one pair of single-precision elements where the
2686/// sign-bit of the first element is 0 and the sign-bit of the second element
2687/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2688/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2689/// otherwise it returns 0.
2690///
2691/// \headerfile <x86intrin.h>
2692///
2693/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2694///
2695/// \param __a
2696/// A 128-bit vector of [4 x float].
2697/// \param __b
2698/// A 128-bit vector of [4 x float].
2699/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2700static __inline int __DEFAULT_FN_ATTRS128
2701_mm_testnzc_ps(__m128 __a, __m128 __b)
2702{
2703 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2704}
2705
2706/// Given two 256-bit floating-point vectors of [4 x double], perform an
2707/// element-by-element comparison of the double-precision elements in the
2708/// first source vector and the corresponding elements in the second source
2709/// vector.
2710///
2711/// The EFLAGS register is updated as follows: \n
2712/// If there is at least one pair of double-precision elements where the
2713/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2714/// ZF flag is set to 1. \n
2715/// If there is at least one pair of double-precision elements where the
2716/// sign-bit of the first element is 0 and the sign-bit of the second element
2717/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2718/// This intrinsic returns the value of the ZF flag.
2719///
2720/// \headerfile <x86intrin.h>
2721///
2722/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2723///
2724/// \param __a
2725/// A 256-bit vector of [4 x double].
2726/// \param __b
2727/// A 256-bit vector of [4 x double].
2728/// \returns the ZF flag.
2729static __inline int __DEFAULT_FN_ATTRS
2730_mm256_testz_pd(__m256d __a, __m256d __b)
2731{
2732 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2733}
2734
2735/// Given two 256-bit floating-point vectors of [4 x double], perform an
2736/// element-by-element comparison of the double-precision elements in the
2737/// first source vector and the corresponding elements in the second source
2738/// vector.
2739///
2740/// The EFLAGS register is updated as follows: \n
2741/// If there is at least one pair of double-precision elements where the
2742/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2743/// ZF flag is set to 1. \n
2744/// If there is at least one pair of double-precision elements where the
2745/// sign-bit of the first element is 0 and the sign-bit of the second element
2746/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2747/// This intrinsic returns the value of the CF flag.
2748///
2749/// \headerfile <x86intrin.h>
2750///
2751/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2752///
2753/// \param __a
2754/// A 256-bit vector of [4 x double].
2755/// \param __b
2756/// A 256-bit vector of [4 x double].
2757/// \returns the CF flag.
2758static __inline int __DEFAULT_FN_ATTRS
2759_mm256_testc_pd(__m256d __a, __m256d __b)
2760{
2761 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2762}
2763
2764/// Given two 256-bit floating-point vectors of [4 x double], perform an
2765/// element-by-element comparison of the double-precision elements in the
2766/// first source vector and the corresponding elements in the second source
2767/// vector.
2768///
2769/// The EFLAGS register is updated as follows: \n
2770/// If there is at least one pair of double-precision elements where the
2771/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2772/// ZF flag is set to 1. \n
2773/// If there is at least one pair of double-precision elements where the
2774/// sign-bit of the first element is 0 and the sign-bit of the second element
2775/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2776/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2777/// otherwise it returns 0.
2778///
2779/// \headerfile <x86intrin.h>
2780///
2781/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2782///
2783/// \param __a
2784/// A 256-bit vector of [4 x double].
2785/// \param __b
2786/// A 256-bit vector of [4 x double].
2787/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2788static __inline int __DEFAULT_FN_ATTRS
2789_mm256_testnzc_pd(__m256d __a, __m256d __b)
2790{
2791 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2792}
2793
2794/// Given two 256-bit floating-point vectors of [8 x float], perform an
2795/// element-by-element comparison of the single-precision element in the
2796/// first source vector and the corresponding element in the second source
2797/// vector.
2798///
2799/// The EFLAGS register is updated as follows: \n
2800/// If there is at least one pair of single-precision elements where the
2801/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2802/// ZF flag is set to 1. \n
2803/// If there is at least one pair of single-precision elements where the
2804/// sign-bit of the first element is 0 and the sign-bit of the second element
2805/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2806/// This intrinsic returns the value of the ZF flag.
2807///
2808/// \headerfile <x86intrin.h>
2809///
2810/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2811///
2812/// \param __a
2813/// A 256-bit vector of [8 x float].
2814/// \param __b
2815/// A 256-bit vector of [8 x float].
2816/// \returns the ZF flag.
2817static __inline int __DEFAULT_FN_ATTRS
2818_mm256_testz_ps(__m256 __a, __m256 __b)
2819{
2820 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2821}
2822
2823/// Given two 256-bit floating-point vectors of [8 x float], perform an
2824/// element-by-element comparison of the single-precision element in the
2825/// first source vector and the corresponding element in the second source
2826/// vector.
2827///
2828/// The EFLAGS register is updated as follows: \n
2829/// If there is at least one pair of single-precision elements where the
2830/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2831/// ZF flag is set to 1. \n
2832/// If there is at least one pair of single-precision elements where the
2833/// sign-bit of the first element is 0 and the sign-bit of the second element
2834/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2835/// This intrinsic returns the value of the CF flag.
2836///
2837/// \headerfile <x86intrin.h>
2838///
2839/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2840///
2841/// \param __a
2842/// A 256-bit vector of [8 x float].
2843/// \param __b
2844/// A 256-bit vector of [8 x float].
2845/// \returns the CF flag.
2846static __inline int __DEFAULT_FN_ATTRS
2847_mm256_testc_ps(__m256 __a, __m256 __b)
2848{
2849 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2850}
2851
2852/// Given two 256-bit floating-point vectors of [8 x float], perform an
2853/// element-by-element comparison of the single-precision elements in the
2854/// first source vector and the corresponding elements in the second source
2855/// vector.
2856///
2857/// The EFLAGS register is updated as follows: \n
2858/// If there is at least one pair of single-precision elements where the
2859/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2860/// ZF flag is set to 1. \n
2861/// If there is at least one pair of single-precision elements where the
2862/// sign-bit of the first element is 0 and the sign-bit of the second element
2863/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2864/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2865/// otherwise it returns 0.
2866///
2867/// \headerfile <x86intrin.h>
2868///
2869/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2870///
2871/// \param __a
2872/// A 256-bit vector of [8 x float].
2873/// \param __b
2874/// A 256-bit vector of [8 x float].
2875/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2876static __inline int __DEFAULT_FN_ATTRS
2878{
2879 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2880}
2881
2882/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2883/// of the two source vectors.
2884///
2885/// The EFLAGS register is updated as follows: \n
2886/// If there is at least one pair of bits where both bits are 1, the ZF flag
2887/// is set to 0. Otherwise the ZF flag is set to 1. \n
2888/// If there is at least one pair of bits where the bit from the first source
2889/// vector is 0 and the bit from the second source vector is 1, the CF flag
2890/// is set to 0. Otherwise the CF flag is set to 1. \n
2891/// This intrinsic returns the value of the ZF flag.
2892///
2893/// \headerfile <x86intrin.h>
2894///
2895/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2896///
2897/// \param __a
2898/// A 256-bit integer vector.
2899/// \param __b
2900/// A 256-bit integer vector.
2901/// \returns the ZF flag.
2902static __inline int __DEFAULT_FN_ATTRS
2903_mm256_testz_si256(__m256i __a, __m256i __b)
2904{
2905 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2906}
2907
2908/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2909/// of the two source vectors.
2910///
2911/// The EFLAGS register is updated as follows: \n
2912/// If there is at least one pair of bits where both bits are 1, the ZF flag
2913/// is set to 0. Otherwise the ZF flag is set to 1. \n
2914/// If there is at least one pair of bits where the bit from the first source
2915/// vector is 0 and the bit from the second source vector is 1, the CF flag
2916/// is set to 0. Otherwise the CF flag is set to 1. \n
2917/// This intrinsic returns the value of the CF flag.
2918///
2919/// \headerfile <x86intrin.h>
2920///
2921/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2922///
2923/// \param __a
2924/// A 256-bit integer vector.
2925/// \param __b
2926/// A 256-bit integer vector.
2927/// \returns the CF flag.
2928static __inline int __DEFAULT_FN_ATTRS
2929_mm256_testc_si256(__m256i __a, __m256i __b)
2930{
2931 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2932}
2933
2934/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2935/// of the two source vectors.
2936///
2937/// The EFLAGS register is updated as follows: \n
2938/// If there is at least one pair of bits where both bits are 1, the ZF flag
2939/// is set to 0. Otherwise the ZF flag is set to 1. \n
2940/// If there is at least one pair of bits where the bit from the first source
2941/// vector is 0 and the bit from the second source vector is 1, the CF flag
2942/// is set to 0. Otherwise the CF flag is set to 1. \n
2943/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2944/// otherwise it returns 0.
2945///
2946/// \headerfile <x86intrin.h>
2947///
2948/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2949///
2950/// \param __a
2951/// A 256-bit integer vector.
2952/// \param __b
2953/// A 256-bit integer vector.
2954/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2955static __inline int __DEFAULT_FN_ATTRS
2957{
2958 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2959}
2960
2961/* Vector extract sign mask */
2962/// Extracts the sign bits of double-precision floating point elements
2963/// in a 256-bit vector of [4 x double] and writes them to the lower order
2964/// bits of the return value.
2965///
2966/// \headerfile <x86intrin.h>
2967///
2968/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2969///
2970/// \param __a
2971/// A 256-bit vector of [4 x double] containing the double-precision
2972/// floating point values with sign bits to be extracted.
2973/// \returns The sign bits from the operand, written to bits [3:0].
2974static __inline int __DEFAULT_FN_ATTRS
2976{
2977 return __builtin_ia32_movmskpd256((__v4df)__a);
2978}
2979
2980/// Extracts the sign bits of single-precision floating point elements
2981/// in a 256-bit vector of [8 x float] and writes them to the lower order
2982/// bits of the return value.
2983///
2984/// \headerfile <x86intrin.h>
2985///
2986/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2987///
2988/// \param __a
2989/// A 256-bit vector of [8 x float] containing the single-precision floating
2990/// point values with sign bits to be extracted.
2991/// \returns The sign bits from the operand, written to bits [7:0].
2992static __inline int __DEFAULT_FN_ATTRS
2994{
2995 return __builtin_ia32_movmskps256((__v8sf)__a);
2996}
2997
2998/* Vector __zero */
2999/// Zeroes the contents of all XMM or YMM registers.
3000///
3001/// \headerfile <x86intrin.h>
3002///
3003/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  /* Takes no operands and produces no value; the builtin stands for the
     VZEROALL instruction. */
  __builtin_ia32_vzeroall();
}
3009
3010/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
3011///
3012/// \headerfile <x86intrin.h>
3013///
3014/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  /* Takes no operands and produces no value; the builtin stands for the
     VZEROUPPER instruction. */
  __builtin_ia32_vzeroupper();
}
3020
3021/* Vector load with broadcast */
3022/// Loads a scalar single-precision floating point value from the
3023/// specified address pointed to by \a __a and broadcasts it to the elements
3024/// of a [4 x float] vector.
3025///
3026/// \headerfile <x86intrin.h>
3027///
3028/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3029///
3030/// \param __a
/// A pointer to the single-precision floating point value to be broadcast.
3032/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3033/// equal to the broadcast value.
3034static __inline __m128 __DEFAULT_FN_ATTRS128
3036{
3037 struct __mm_broadcast_ss_struct {
3038 float __f;
3039 } __attribute__((__packed__, __may_alias__));
3040 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3041 return __extension__ (__m128){ __f, __f, __f, __f };
3042}
3043
3044/// Loads a scalar double-precision floating point value from the
3045/// specified address pointed to by \a __a and broadcasts it to the elements
3046/// of a [4 x double] vector.
3047///
3048/// \headerfile <x86intrin.h>
3049///
3050/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3051///
3052/// \param __a
/// A pointer to the double-precision floating point value to be broadcast.
3054/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3055/// equal to the broadcast value.
3056static __inline __m256d __DEFAULT_FN_ATTRS
3058{
3059 struct __mm256_broadcast_sd_struct {
3060 double __d;
3061 } __attribute__((__packed__, __may_alias__));
3062 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3063 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3064}
3065
3066/// Loads a scalar single-precision floating point value from the
3067/// specified address pointed to by \a __a and broadcasts it to the elements
3068/// of a [8 x float] vector.
3069///
3070/// \headerfile <x86intrin.h>
3071///
3072/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3073///
3074/// \param __a
/// A pointer to the single-precision floating point value to be broadcast.
3076/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3077/// equal to the broadcast value.
3078static __inline __m256 __DEFAULT_FN_ATTRS
3080{
3081 struct __mm256_broadcast_ss_struct {
3082 float __f;
3083 } __attribute__((__packed__, __may_alias__));
3084 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3085 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3086}
3087
3088/// Loads the data from a 128-bit vector of [2 x double] from the
3089/// specified address pointed to by \a __a and broadcasts it to 128-bit
3090/// elements in a 256-bit vector of [4 x double].
3091///
3092/// \headerfile <x86intrin.h>
3093///
3094/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3095///
3096/// \param __a
/// A pointer to the 128-bit vector of [2 x double] to be broadcast.
3098/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3099/// equal to the broadcast value.
3100static __inline __m256d __DEFAULT_FN_ATTRS
3102{
3103 __m128d __b = _mm_loadu_pd((const double *)__a);
3104 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3105 0, 1, 0, 1);
3106}
3107
3108/// Loads the data from a 128-bit vector of [4 x float] from the
3109/// specified address pointed to by \a __a and broadcasts it to 128-bit
3110/// elements in a 256-bit vector of [8 x float].
3111///
3112/// \headerfile <x86intrin.h>
3113///
3114/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3115///
3116/// \param __a
3117/// The 128-bit vector of [4 x float] to be broadcast.
3118/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3119/// equal to the broadcast value.
3120static __inline __m256 __DEFAULT_FN_ATTRS
3122{
3123 __m128 __b = _mm_loadu_ps((const float *)__a);
3124 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3125 0, 1, 2, 3, 0, 1, 2, 3);
3126}
3127
3128/* SIMD load ops */
3129/// Loads 4 double-precision floating point values from a 32-byte aligned
3130/// memory location pointed to by \a __p into a vector of [4 x double].
3131///
3132/// \headerfile <x86intrin.h>
3133///
3134/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3135///
3136/// \param __p
3137/// A 32-byte aligned pointer to a memory location containing
3138/// double-precision floating point values.
3139/// \returns A 256-bit vector of [4 x double] containing the moved values.
3140static __inline __m256d __DEFAULT_FN_ATTRS
3141_mm256_load_pd(double const *__p)
3142{
3143 return *(const __m256d *)__p;
3144}
3145
3146/// Loads 8 single-precision floating point values from a 32-byte aligned
3147/// memory location pointed to by \a __p into a vector of [8 x float].
3148///
3149/// \headerfile <x86intrin.h>
3150///
3151/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3152///
3153/// \param __p
3154/// A 32-byte aligned pointer to a memory location containing float values.
3155/// \returns A 256-bit vector of [8 x float] containing the moved values.
3156static __inline __m256 __DEFAULT_FN_ATTRS
3157_mm256_load_ps(float const *__p)
3158{
3159 return *(const __m256 *)__p;
3160}
3161
3162/// Loads 4 double-precision floating point values from an unaligned
3163/// memory location pointed to by \a __p into a vector of [4 x double].
3164///
3165/// \headerfile <x86intrin.h>
3166///
3167/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3168///
3169/// \param __p
3170/// A pointer to a memory location containing double-precision floating
3171/// point values.
3172/// \returns A 256-bit vector of [4 x double] containing the moved values.
3173static __inline __m256d __DEFAULT_FN_ATTRS
3174_mm256_loadu_pd(double const *__p)
3175{
3176 struct __loadu_pd {
3177 __m256d_u __v;
3178 } __attribute__((__packed__, __may_alias__));
3179 return ((const struct __loadu_pd*)__p)->__v;
3180}
3181
3182/// Loads 8 single-precision floating point values from an unaligned
3183/// memory location pointed to by \a __p into a vector of [8 x float].
3184///
3185/// \headerfile <x86intrin.h>
3186///
3187/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3188///
3189/// \param __p
3190/// A pointer to a memory location containing single-precision floating
3191/// point values.
3192/// \returns A 256-bit vector of [8 x float] containing the moved values.
3193static __inline __m256 __DEFAULT_FN_ATTRS
3195{
3196 struct __loadu_ps {
3197 __m256_u __v;
3198 } __attribute__((__packed__, __may_alias__));
3199 return ((const struct __loadu_ps*)__p)->__v;
3200}
3201
3202/// Loads 256 bits of integer data from a 32-byte aligned memory
3203/// location pointed to by \a __p into elements of a 256-bit integer vector.
3204///
3205/// \headerfile <x86intrin.h>
3206///
3207/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3208///
3209/// \param __p
3210/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3211/// values.
3212/// \returns A 256-bit integer vector containing the moved values.
3213static __inline __m256i __DEFAULT_FN_ATTRS
3214_mm256_load_si256(__m256i const *__p)
3215{
3216 return *__p;
3217}
3218
3219/// Loads 256 bits of integer data from an unaligned memory location
3220/// pointed to by \a __p into a 256-bit integer vector.
3221///
3222/// \headerfile <x86intrin.h>
3223///
3224/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3225///
3226/// \param __p
3227/// A pointer to a 256-bit integer vector containing integer values.
3228/// \returns A 256-bit integer vector containing the moved values.
3229static __inline __m256i __DEFAULT_FN_ATTRS
3230_mm256_loadu_si256(__m256i_u const *__p)
3231{
3232 struct __loadu_si256 {
3233 __m256i_u __v;
3234 } __attribute__((__packed__, __may_alias__));
3235 return ((const struct __loadu_si256*)__p)->__v;
3236}
3237
3238/// Loads 256 bits of integer data from an unaligned memory location
3239/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3240/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3241/// line boundary.
3242///
3243/// \headerfile <x86intrin.h>
3244///
3245/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3246///
3247/// \param __p
3248/// A pointer to a 256-bit integer vector containing integer values.
3249/// \returns A 256-bit integer vector containing the moved values.
3250static __inline __m256i __DEFAULT_FN_ATTRS
3251_mm256_lddqu_si256(__m256i_u const *__p)
3252{
3253 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3254}
3255
3256/* SIMD store ops */
3257/// Stores double-precision floating point values from a 256-bit vector
3258/// of [4 x double] to a 32-byte aligned memory location pointed to by
3259/// \a __p.
3260///
3261/// \headerfile <x86intrin.h>
3262///
3263/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3264///
3265/// \param __p
3266/// A 32-byte aligned pointer to a memory location that will receive the
3267/// double-precision floating point values.
3268/// \param __a
3269/// A 256-bit vector of [4 x double] containing the values to be moved.
3270static __inline void __DEFAULT_FN_ATTRS
3271_mm256_store_pd(double *__p, __m256d __a)
3272{
3273 *(__m256d *)__p = __a;
3274}
3275
3276/// Stores single-precision floating point values from a 256-bit vector
3277/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3278///
3279/// \headerfile <x86intrin.h>
3280///
3281/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3282///
3283/// \param __p
3284/// A 32-byte aligned pointer to a memory location that will receive the
3285/// float values.
3286/// \param __a
3287/// A 256-bit vector of [8 x float] containing the values to be moved.
3288static __inline void __DEFAULT_FN_ATTRS
3289_mm256_store_ps(float *__p, __m256 __a)
3290{
3291 *(__m256 *)__p = __a;
3292}
3293
3294/// Stores double-precision floating point values from a 256-bit vector
3295/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3296///
3297/// \headerfile <x86intrin.h>
3298///
3299/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3300///
3301/// \param __p
3302/// A pointer to a memory location that will receive the double-precision
3303/// floating point values.
3304/// \param __a
3305/// A 256-bit vector of [4 x double] containing the values to be moved.
3306static __inline void __DEFAULT_FN_ATTRS
3307_mm256_storeu_pd(double *__p, __m256d __a)
3308{
3309 struct __storeu_pd {
3310 __m256d_u __v;
3311 } __attribute__((__packed__, __may_alias__));
3312 ((struct __storeu_pd*)__p)->__v = __a;
3313}
3314
3315/// Stores single-precision floating point values from a 256-bit vector
3316/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3317///
3318/// \headerfile <x86intrin.h>
3319///
3320/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3321///
3322/// \param __p
3323/// A pointer to a memory location that will receive the float values.
3324/// \param __a
3325/// A 256-bit vector of [8 x float] containing the values to be moved.
3326static __inline void __DEFAULT_FN_ATTRS
3327_mm256_storeu_ps(float *__p, __m256 __a)
3328{
3329 struct __storeu_ps {
3330 __m256_u __v;
3331 } __attribute__((__packed__, __may_alias__));
3332 ((struct __storeu_ps*)__p)->__v = __a;
3333}
3334
3335/// Stores integer values from a 256-bit integer vector to a 32-byte
3336/// aligned memory location pointed to by \a __p.
3337///
3338/// \headerfile <x86intrin.h>
3339///
3340/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3341///
3342/// \param __p
3343/// A 32-byte aligned pointer to a memory location that will receive the
3344/// integer values.
3345/// \param __a
3346/// A 256-bit integer vector containing the values to be moved.
3347static __inline void __DEFAULT_FN_ATTRS
3348_mm256_store_si256(__m256i *__p, __m256i __a)
3349{
3350 *__p = __a;
3351}
3352
3353/// Stores integer values from a 256-bit integer vector to an unaligned
3354/// memory location pointed to by \a __p.
3355///
3356/// \headerfile <x86intrin.h>
3357///
3358/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3359///
3360/// \param __p
3361/// A pointer to a memory location that will receive the integer values.
3362/// \param __a
3363/// A 256-bit integer vector containing the values to be moved.
3364static __inline void __DEFAULT_FN_ATTRS
3365_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3366{
3367 struct __storeu_si256 {
3368 __m256i_u __v;
3369 } __attribute__((__packed__, __may_alias__));
3370 ((struct __storeu_si256*)__p)->__v = __a;
3371}
3372
3373/* Conditional load ops */
3374/// Conditionally loads double-precision floating point elements from a
3375/// memory location pointed to by \a __p into a 128-bit vector of
3376/// [2 x double], depending on the mask bits associated with each data
3377/// element.
3378///
3379/// \headerfile <x86intrin.h>
3380///
3381/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3382///
3383/// \param __p
3384/// A pointer to a memory location that contains the double-precision
3385/// floating point values.
3386/// \param __m
3387/// A 128-bit integer vector containing the mask. The most significant bit of
3388/// each data element represents the mask bits. If a mask bit is zero, the
3389/// corresponding value in the memory location is not loaded and the
3390/// corresponding field in the return value is set to zero.
3391/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3392static __inline __m128d __DEFAULT_FN_ATTRS128
3393_mm_maskload_pd(double const *__p, __m128i __m)
3394{
3395 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3396}
3397
3398/// Conditionally loads double-precision floating point elements from a
3399/// memory location pointed to by \a __p into a 256-bit vector of
3400/// [4 x double], depending on the mask bits associated with each data
3401/// element.
3402///
3403/// \headerfile <x86intrin.h>
3404///
3405/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3406///
3407/// \param __p
3408/// A pointer to a memory location that contains the double-precision
3409/// floating point values.
3410/// \param __m
3411/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3412/// significant bit of each quadword element represents the mask bits. If a
3413/// mask bit is zero, the corresponding value in the memory location is not
3414/// loaded and the corresponding field in the return value is set to zero.
3415/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3416static __inline __m256d __DEFAULT_FN_ATTRS
3417_mm256_maskload_pd(double const *__p, __m256i __m)
3418{
3419 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3420 (__v4di)__m);
3421}
3422
3423/// Conditionally loads single-precision floating point elements from a
3424/// memory location pointed to by \a __p into a 128-bit vector of
3425/// [4 x float], depending on the mask bits associated with each data
3426/// element.
3427///
3428/// \headerfile <x86intrin.h>
3429///
3430/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3431///
3432/// \param __p
3433/// A pointer to a memory location that contains the single-precision
3434/// floating point values.
3435/// \param __m
3436/// A 128-bit integer vector containing the mask. The most significant bit of
3437/// each data element represents the mask bits. If a mask bit is zero, the
3438/// corresponding value in the memory location is not loaded and the
3439/// corresponding field in the return value is set to zero.
3440/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3441static __inline __m128 __DEFAULT_FN_ATTRS128
3442_mm_maskload_ps(float const *__p, __m128i __m)
3443{
3444 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3445}
3446
3447/// Conditionally loads single-precision floating point elements from a
3448/// memory location pointed to by \a __p into a 256-bit vector of
3449/// [8 x float], depending on the mask bits associated with each data
3450/// element.
3451///
3452/// \headerfile <x86intrin.h>
3453///
3454/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3455///
3456/// \param __p
3457/// A pointer to a memory location that contains the single-precision
3458/// floating point values.
3459/// \param __m
3460/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3461/// significant bit of each dword element represents the mask bits. If a mask
3462/// bit is zero, the corresponding value in the memory location is not loaded
3463/// and the corresponding field in the return value is set to zero.
3464/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3465static __inline __m256 __DEFAULT_FN_ATTRS
3466_mm256_maskload_ps(float const *__p, __m256i __m)
3467{
3468 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3469}
3470
3471/* Conditional store ops */
3472/// Moves single-precision floating point values from a 256-bit vector
3473/// of [8 x float] to a memory location pointed to by \a __p, according to
3474/// the specified mask.
3475///
3476/// \headerfile <x86intrin.h>
3477///
3478/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3479///
3480/// \param __p
3481/// A pointer to a memory location that will receive the float values.
3482/// \param __m
3483/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3484/// significant bit of each dword element in the mask vector represents the
3485/// mask bits. If a mask bit is zero, the corresponding value from vector
3486/// \a __a is not stored and the corresponding field in the memory location
3487/// pointed to by \a __p is not changed.
3488/// \param __a
3489/// A 256-bit vector of [8 x float] containing the values to be stored.
3490static __inline void __DEFAULT_FN_ATTRS
3491_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3492{
3493 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3494}
3495
3496/// Moves double-precision values from a 128-bit vector of [2 x double]
3497/// to a memory location pointed to by \a __p, according to the specified
3498/// mask.
3499///
3500/// \headerfile <x86intrin.h>
3501///
3502/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3503///
3504/// \param __p
3505/// A pointer to a memory location that will receive the double-precision values.
3506/// \param __m
3507/// A 128-bit integer vector containing the mask. The most significant bit of
3508/// each field in the mask vector represents the mask bits. If a mask bit is
3509/// zero, the corresponding value from vector \a __a is not stored and the
3510/// corresponding field in the memory location pointed to by \a __p is not
3511/// changed.
3512/// \param __a
3513/// A 128-bit vector of [2 x double] containing the values to be stored.
3514static __inline void __DEFAULT_FN_ATTRS128
3515_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3516{
3517 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3518}
3519
3520/// Moves double-precision values from a 256-bit vector of [4 x double]
3521/// to a memory location pointed to by \a __p, according to the specified
3522/// mask.
3523///
3524/// \headerfile <x86intrin.h>
3525///
3526/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3527///
3528/// \param __p
3529/// A pointer to a memory location that will receive the double-precision values.
3530/// \param __m
3531/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3532/// significant bit of each quadword element in the mask vector represents
3533/// the mask bits. If a mask bit is zero, the corresponding value from vector
3534/// \a __a is not stored and the corresponding field in the memory location
3535/// pointed to by \a __p is not changed.
3536/// \param __a
3537/// A 256-bit vector of [4 x double] containing the values to be stored.
3538static __inline void __DEFAULT_FN_ATTRS
3539_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3540{
3541 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3542}
3543
3544/// Moves single-precision floating point values from a 128-bit vector
3545/// of [4 x float] to a memory location pointed to by \a __p, according to
3546/// the specified mask.
3547///
3548/// \headerfile <x86intrin.h>
3549///
3550/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3551///
3552/// \param __p
3553/// A pointer to a memory location that will receive the float values.
3554/// \param __m
3555/// A 128-bit integer vector containing the mask. The most significant bit of
3556/// each field in the mask vector represents the mask bits. If a mask bit is
3557/// zero, the corresponding value from vector \a __a is not stored and the
3558/// corresponding field in the memory location pointed to by \a __p is not
3559/// changed.
3560/// \param __a
3561/// A 128-bit vector of [4 x float] containing the values to be stored.
3562static __inline void __DEFAULT_FN_ATTRS128
3563_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3564{
3565 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3566}
3567
3568/* Cacheability support ops */
3569/// Moves integer data from a 256-bit integer vector to a 32-byte
3570/// aligned memory location. To minimize caching, the data is flagged as
3571/// non-temporal (unlikely to be used again soon).
3572///
3573/// \headerfile <x86intrin.h>
3574///
3575/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3576///
3577/// \param __a
3578/// A pointer to a 32-byte aligned memory location that will receive the
3579/// integer values.
3580/// \param __b
3581/// A 256-bit integer vector containing the values to be moved.
3582static __inline void __DEFAULT_FN_ATTRS
3584{
3585 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3586 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3587}
3588
3589/// Moves double-precision values from a 256-bit vector of [4 x double]
3590/// to a 32-byte aligned memory location. To minimize caching, the data is
3591/// flagged as non-temporal (unlikely to be used again soon).
3592///
3593/// \headerfile <x86intrin.h>
3594///
3595/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3596///
3597/// \param __a
3598/// A pointer to a 32-byte aligned memory location that will receive the
3599/// double-precision floating-point values.
3600/// \param __b
3601/// A 256-bit vector of [4 x double] containing the values to be moved.
3602static __inline void __DEFAULT_FN_ATTRS
3603_mm256_stream_pd(void *__a, __m256d __b)
3604{
3605 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3606 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3607}
3608
3609/// Moves single-precision floating point values from a 256-bit vector
3610/// of [8 x float] to a 32-byte aligned memory location. To minimize
3611/// caching, the data is flagged as non-temporal (unlikely to be used again
3612/// soon).
3613///
3614/// \headerfile <x86intrin.h>
3615///
3616/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3617///
3618/// \param __p
3619/// A pointer to a 32-byte aligned memory location that will receive the
3620/// single-precision floating point values.
3621/// \param __a
3622/// A 256-bit vector of [8 x float] containing the values to be moved.
3623static __inline void __DEFAULT_FN_ATTRS
3625{
3626 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3627 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3628}
3629
3630/* Create vectors */
3631/// Create a 256-bit vector of [4 x double] with undefined values.
3632///
3633/// \headerfile <x86intrin.h>
3634///
3635/// This intrinsic has no corresponding instruction.
3636///
3637/// \returns A 256-bit vector of [4 x double] containing undefined values.
3638static __inline__ __m256d __DEFAULT_FN_ATTRS
3640{
3641 return (__m256d)__builtin_ia32_undef256();
3642}
3643
3644/// Create a 256-bit vector of [8 x float] with undefined values.
3645///
3646/// \headerfile <x86intrin.h>
3647///
3648/// This intrinsic has no corresponding instruction.
3649///
3650/// \returns A 256-bit vector of [8 x float] containing undefined values.
3651static __inline__ __m256 __DEFAULT_FN_ATTRS
3653{
3654 return (__m256)__builtin_ia32_undef256();
3655}
3656
3657/// Create a 256-bit integer vector with undefined values.
3658///
3659/// \headerfile <x86intrin.h>
3660///
3661/// This intrinsic has no corresponding instruction.
3662///
3663/// \returns A 256-bit integer vector containing undefined values.
3664static __inline__ __m256i __DEFAULT_FN_ATTRS
3666{
3667 return (__m256i)__builtin_ia32_undef256();
3668}
3669
3670/// Constructs a 256-bit floating-point vector of [4 x double]
3671/// initialized with the specified double-precision floating-point values.
3672///
3673/// \headerfile <x86intrin.h>
3674///
3675/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3676/// instruction.
3677///
3678/// \param __a
3679/// A double-precision floating-point value used to initialize bits [255:192]
3680/// of the result.
3681/// \param __b
3682/// A double-precision floating-point value used to initialize bits [191:128]
3683/// of the result.
3684/// \param __c
3685/// A double-precision floating-point value used to initialize bits [127:64]
3686/// of the result.
3687/// \param __d
3688/// A double-precision floating-point value used to initialize bits [63:0]
3689/// of the result.
3690/// \returns An initialized 256-bit floating-point vector of [4 x double].
3691static __inline __m256d __DEFAULT_FN_ATTRS
3692_mm256_set_pd(double __a, double __b, double __c, double __d)
3693{
3694 return __extension__ (__m256d){ __d, __c, __b, __a };
3695}
3696
3697/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3698/// with the specified single-precision floating-point values.
3699///
3700/// \headerfile <x86intrin.h>
3701///
3702/// This intrinsic is a utility function and does not correspond to a specific
3703/// instruction.
3704///
3705/// \param __a
3706/// A single-precision floating-point value used to initialize bits [255:224]
3707/// of the result.
3708/// \param __b
3709/// A single-precision floating-point value used to initialize bits [223:192]
3710/// of the result.
3711/// \param __c
3712/// A single-precision floating-point value used to initialize bits [191:160]
3713/// of the result.
3714/// \param __d
3715/// A single-precision floating-point value used to initialize bits [159:128]
3716/// of the result.
3717/// \param __e
3718/// A single-precision floating-point value used to initialize bits [127:96]
3719/// of the result.
3720/// \param __f
3721/// A single-precision floating-point value used to initialize bits [95:64]
3722/// of the result.
3723/// \param __g
3724/// A single-precision floating-point value used to initialize bits [63:32]
3725/// of the result.
3726/// \param __h
3727/// A single-precision floating-point value used to initialize bits [31:0]
3728/// of the result.
3729/// \returns An initialized 256-bit floating-point vector of [8 x float].
3730static __inline __m256 __DEFAULT_FN_ATTRS
3731_mm256_set_ps(float __a, float __b, float __c, float __d,
3732 float __e, float __f, float __g, float __h)
3733{
3734 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3735}
3736
3737/// Constructs a 256-bit integer vector initialized with the specified
3738/// 32-bit integral values.
3739///
3740/// \headerfile <x86intrin.h>
3741///
3742/// This intrinsic is a utility function and does not correspond to a specific
3743/// instruction.
3744///
3745/// \param __i0
3746/// A 32-bit integral value used to initialize bits [255:224] of the result.
3747/// \param __i1
3748/// A 32-bit integral value used to initialize bits [223:192] of the result.
3749/// \param __i2
3750/// A 32-bit integral value used to initialize bits [191:160] of the result.
3751/// \param __i3
3752/// A 32-bit integral value used to initialize bits [159:128] of the result.
3753/// \param __i4
3754/// A 32-bit integral value used to initialize bits [127:96] of the result.
3755/// \param __i5
3756/// A 32-bit integral value used to initialize bits [95:64] of the result.
3757/// \param __i6
3758/// A 32-bit integral value used to initialize bits [63:32] of the result.
3759/// \param __i7
3760/// A 32-bit integral value used to initialize bits [31:0] of the result.
3761/// \returns An initialized 256-bit integer vector.
3762static __inline __m256i __DEFAULT_FN_ATTRS
3763_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3764 int __i4, int __i5, int __i6, int __i7)
3765{
3766 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3767}
3768
3769/// Constructs a 256-bit integer vector initialized with the specified
3770/// 16-bit integral values.
3771///
3772/// \headerfile <x86intrin.h>
3773///
3774/// This intrinsic is a utility function and does not correspond to a specific
3775/// instruction.
3776///
3777/// \param __w15
3778/// A 16-bit integral value used to initialize bits [255:240] of the result.
3779/// \param __w14
3780/// A 16-bit integral value used to initialize bits [239:224] of the result.
3781/// \param __w13
3782/// A 16-bit integral value used to initialize bits [223:208] of the result.
3783/// \param __w12
3784/// A 16-bit integral value used to initialize bits [207:192] of the result.
3785/// \param __w11
3786/// A 16-bit integral value used to initialize bits [191:176] of the result.
3787/// \param __w10
3788/// A 16-bit integral value used to initialize bits [175:160] of the result.
3789/// \param __w09
3790/// A 16-bit integral value used to initialize bits [159:144] of the result.
3791/// \param __w08
3792/// A 16-bit integral value used to initialize bits [143:128] of the result.
3793/// \param __w07
3794/// A 16-bit integral value used to initialize bits [127:112] of the result.
3795/// \param __w06
3796/// A 16-bit integral value used to initialize bits [111:96] of the result.
3797/// \param __w05
3798/// A 16-bit integral value used to initialize bits [95:80] of the result.
3799/// \param __w04
3800/// A 16-bit integral value used to initialize bits [79:64] of the result.
3801/// \param __w03
3802/// A 16-bit integral value used to initialize bits [63:48] of the result.
3803/// \param __w02
3804/// A 16-bit integral value used to initialize bits [47:32] of the result.
3805/// \param __w01
3806/// A 16-bit integral value used to initialize bits [31:16] of the result.
3807/// \param __w00
3808/// A 16-bit integral value used to initialize bits [15:0] of the result.
3809/// \returns An initialized 256-bit integer vector.
3810static __inline __m256i __DEFAULT_FN_ATTRS
3811_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3812 short __w11, short __w10, short __w09, short __w08,
3813 short __w07, short __w06, short __w05, short __w04,
3814 short __w03, short __w02, short __w01, short __w00)
3815{
3816 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3817 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3818}
3819
3820/// Constructs a 256-bit integer vector initialized with the specified
3821/// 8-bit integral values.
3822///
3823/// \headerfile <x86intrin.h>
3824///
3825/// This intrinsic is a utility function and does not correspond to a specific
3826/// instruction.
3827///
3828/// \param __b31
3829/// An 8-bit integral value used to initialize bits [255:248] of the result.
3830/// \param __b30
3831/// An 8-bit integral value used to initialize bits [247:240] of the result.
3832/// \param __b29
3833/// An 8-bit integral value used to initialize bits [239:232] of the result.
3834/// \param __b28
3835/// An 8-bit integral value used to initialize bits [231:224] of the result.
3836/// \param __b27
3837/// An 8-bit integral value used to initialize bits [223:216] of the result.
3838/// \param __b26
3839/// An 8-bit integral value used to initialize bits [215:208] of the result.
3840/// \param __b25
3841/// An 8-bit integral value used to initialize bits [207:200] of the result.
3842/// \param __b24
3843/// An 8-bit integral value used to initialize bits [199:192] of the result.
3844/// \param __b23
3845/// An 8-bit integral value used to initialize bits [191:184] of the result.
3846/// \param __b22
3847/// An 8-bit integral value used to initialize bits [183:176] of the result.
3848/// \param __b21
3849/// An 8-bit integral value used to initialize bits [175:168] of the result.
3850/// \param __b20
3851/// An 8-bit integral value used to initialize bits [167:160] of the result.
3852/// \param __b19
3853/// An 8-bit integral value used to initialize bits [159:152] of the result.
3854/// \param __b18
3855/// An 8-bit integral value used to initialize bits [151:144] of the result.
3856/// \param __b17
3857/// An 8-bit integral value used to initialize bits [143:136] of the result.
3858/// \param __b16
3859/// An 8-bit integral value used to initialize bits [135:128] of the result.
3860/// \param __b15
3861/// An 8-bit integral value used to initialize bits [127:120] of the result.
3862/// \param __b14
3863/// An 8-bit integral value used to initialize bits [119:112] of the result.
3864/// \param __b13
3865/// An 8-bit integral value used to initialize bits [111:104] of the result.
3866/// \param __b12
3867/// An 8-bit integral value used to initialize bits [103:96] of the result.
3868/// \param __b11
3869/// An 8-bit integral value used to initialize bits [95:88] of the result.
3870/// \param __b10
3871/// An 8-bit integral value used to initialize bits [87:80] of the result.
3872/// \param __b09
3873/// An 8-bit integral value used to initialize bits [79:72] of the result.
3874/// \param __b08
3875/// An 8-bit integral value used to initialize bits [71:64] of the result.
3876/// \param __b07
3877/// An 8-bit integral value used to initialize bits [63:56] of the result.
3878/// \param __b06
3879/// An 8-bit integral value used to initialize bits [55:48] of the result.
3880/// \param __b05
3881/// An 8-bit integral value used to initialize bits [47:40] of the result.
3882/// \param __b04
3883/// An 8-bit integral value used to initialize bits [39:32] of the result.
3884/// \param __b03
3885/// An 8-bit integral value used to initialize bits [31:24] of the result.
3886/// \param __b02
3887/// An 8-bit integral value used to initialize bits [23:16] of the result.
3888/// \param __b01
3889/// An 8-bit integral value used to initialize bits [15:8] of the result.
3890/// \param __b00
3891/// An 8-bit integral value used to initialize bits [7:0] of the result.
3892/// \returns An initialized 256-bit integer vector.
3893static __inline __m256i __DEFAULT_FN_ATTRS
3894_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3895 char __b27, char __b26, char __b25, char __b24,
3896 char __b23, char __b22, char __b21, char __b20,
3897 char __b19, char __b18, char __b17, char __b16,
3898 char __b15, char __b14, char __b13, char __b12,
3899 char __b11, char __b10, char __b09, char __b08,
3900 char __b07, char __b06, char __b05, char __b04,
3901 char __b03, char __b02, char __b01, char __b00)
3902{
3903 return __extension__ (__m256i)(__v32qi){
3904 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3905 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3906 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3907 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3908 };
3909}
3910
3911/// Constructs a 256-bit integer vector initialized with the specified
3912/// 64-bit integral values.
3913///
3914/// \headerfile <x86intrin.h>
3915///
3916/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3917/// instruction.
3918///
3919/// \param __a
3920/// A 64-bit integral value used to initialize bits [255:192] of the result.
3921/// \param __b
3922/// A 64-bit integral value used to initialize bits [191:128] of the result.
3923/// \param __c
3924/// A 64-bit integral value used to initialize bits [127:64] of the result.
3925/// \param __d
3926/// A 64-bit integral value used to initialize bits [63:0] of the result.
3927/// \returns An initialized 256-bit integer vector.
3928static __inline __m256i __DEFAULT_FN_ATTRS
3929_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3930{
3931 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3932}
3933
3934/* Create vectors with elements in reverse order */
3935/// Constructs a 256-bit floating-point vector of [4 x double],
3936/// initialized in reverse order with the specified double-precision
3937/// floating-point values.
3938///
3939/// \headerfile <x86intrin.h>
3940///
3941/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3942/// instruction.
3943///
3944/// \param __a
3945/// A double-precision floating-point value used to initialize bits [63:0]
3946/// of the result.
3947/// \param __b
3948/// A double-precision floating-point value used to initialize bits [127:64]
3949/// of the result.
3950/// \param __c
3951/// A double-precision floating-point value used to initialize bits [191:128]
3952/// of the result.
3953/// \param __d
3954/// A double-precision floating-point value used to initialize bits [255:192]
3955/// of the result.
3956/// \returns An initialized 256-bit floating-point vector of [4 x double].
3957static __inline __m256d __DEFAULT_FN_ATTRS
3958_mm256_setr_pd(double __a, double __b, double __c, double __d)
3959{
3960 return _mm256_set_pd(__d, __c, __b, __a);
3961}
3962
3963/// Constructs a 256-bit floating-point vector of [8 x float],
3964/// initialized in reverse order with the specified single-precision
3965/// float-point values.
3966///
3967/// \headerfile <x86intrin.h>
3968///
3969/// This intrinsic is a utility function and does not correspond to a specific
3970/// instruction.
3971///
3972/// \param __a
3973/// A single-precision floating-point value used to initialize bits [31:0]
3974/// of the result.
3975/// \param __b
3976/// A single-precision floating-point value used to initialize bits [63:32]
3977/// of the result.
3978/// \param __c
3979/// A single-precision floating-point value used to initialize bits [95:64]
3980/// of the result.
3981/// \param __d
3982/// A single-precision floating-point value used to initialize bits [127:96]
3983/// of the result.
3984/// \param __e
3985/// A single-precision floating-point value used to initialize bits [159:128]
3986/// of the result.
3987/// \param __f
3988/// A single-precision floating-point value used to initialize bits [191:160]
3989/// of the result.
3990/// \param __g
3991/// A single-precision floating-point value used to initialize bits [223:192]
3992/// of the result.
3993/// \param __h
3994/// A single-precision floating-point value used to initialize bits [255:224]
3995/// of the result.
3996/// \returns An initialized 256-bit floating-point vector of [8 x float].
3997static __inline __m256 __DEFAULT_FN_ATTRS
3998_mm256_setr_ps(float __a, float __b, float __c, float __d,
3999 float __e, float __f, float __g, float __h)
4000{
4001 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
4002}
4003
4004/// Constructs a 256-bit integer vector, initialized in reverse order
4005/// with the specified 32-bit integral values.
4006///
4007/// \headerfile <x86intrin.h>
4008///
4009/// This intrinsic is a utility function and does not correspond to a specific
4010/// instruction.
4011///
4012/// \param __i0
4013/// A 32-bit integral value used to initialize bits [31:0] of the result.
4014/// \param __i1
4015/// A 32-bit integral value used to initialize bits [63:32] of the result.
4016/// \param __i2
4017/// A 32-bit integral value used to initialize bits [95:64] of the result.
4018/// \param __i3
4019/// A 32-bit integral value used to initialize bits [127:96] of the result.
4020/// \param __i4
4021/// A 32-bit integral value used to initialize bits [159:128] of the result.
4022/// \param __i5
4023/// A 32-bit integral value used to initialize bits [191:160] of the result.
4024/// \param __i6
4025/// A 32-bit integral value used to initialize bits [223:192] of the result.
4026/// \param __i7
4027/// A 32-bit integral value used to initialize bits [255:224] of the result.
4028/// \returns An initialized 256-bit integer vector.
4029static __inline __m256i __DEFAULT_FN_ATTRS
4030_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4031 int __i4, int __i5, int __i6, int __i7)
4032{
4033 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4034}
4035
4036/// Constructs a 256-bit integer vector, initialized in reverse order
4037/// with the specified 16-bit integral values.
4038///
4039/// \headerfile <x86intrin.h>
4040///
4041/// This intrinsic is a utility function and does not correspond to a specific
4042/// instruction.
4043///
4044/// \param __w15
4045/// A 16-bit integral value used to initialize bits [15:0] of the result.
4046/// \param __w14
4047/// A 16-bit integral value used to initialize bits [31:16] of the result.
4048/// \param __w13
4049/// A 16-bit integral value used to initialize bits [47:32] of the result.
4050/// \param __w12
4051/// A 16-bit integral value used to initialize bits [63:48] of the result.
4052/// \param __w11
4053/// A 16-bit integral value used to initialize bits [79:64] of the result.
4054/// \param __w10
4055/// A 16-bit integral value used to initialize bits [95:80] of the result.
4056/// \param __w09
4057/// A 16-bit integral value used to initialize bits [111:96] of the result.
4058/// \param __w08
4059/// A 16-bit integral value used to initialize bits [127:112] of the result.
4060/// \param __w07
4061/// A 16-bit integral value used to initialize bits [143:128] of the result.
4062/// \param __w06
4063/// A 16-bit integral value used to initialize bits [159:144] of the result.
4064/// \param __w05
4065/// A 16-bit integral value used to initialize bits [175:160] of the result.
4066/// \param __w04
4067/// A 16-bit integral value used to initialize bits [191:176] of the result.
4068/// \param __w03
4069/// A 16-bit integral value used to initialize bits [207:192] of the result.
4070/// \param __w02
4071/// A 16-bit integral value used to initialize bits [223:208] of the result.
4072/// \param __w01
4073/// A 16-bit integral value used to initialize bits [239:224] of the result.
4074/// \param __w00
4075/// A 16-bit integral value used to initialize bits [255:240] of the result.
4076/// \returns An initialized 256-bit integer vector.
4077static __inline __m256i __DEFAULT_FN_ATTRS
4078_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4079 short __w11, short __w10, short __w09, short __w08,
4080 short __w07, short __w06, short __w05, short __w04,
4081 short __w03, short __w02, short __w01, short __w00)
4082{
4083 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4084 __w04, __w05, __w06, __w07,
4085 __w08, __w09, __w10, __w11,
4086 __w12, __w13, __w14, __w15);
4087}
4088
4089/// Constructs a 256-bit integer vector, initialized in reverse order
4090/// with the specified 8-bit integral values.
4091///
4092/// \headerfile <x86intrin.h>
4093///
4094/// This intrinsic is a utility function and does not correspond to a specific
4095/// instruction.
4096///
4097/// \param __b31
4098/// An 8-bit integral value used to initialize bits [7:0] of the result.
4099/// \param __b30
4100/// An 8-bit integral value used to initialize bits [15:8] of the result.
4101/// \param __b29
4102/// An 8-bit integral value used to initialize bits [23:16] of the result.
4103/// \param __b28
4104/// An 8-bit integral value used to initialize bits [31:24] of the result.
4105/// \param __b27
4106/// An 8-bit integral value used to initialize bits [39:32] of the result.
4107/// \param __b26
4108/// An 8-bit integral value used to initialize bits [47:40] of the result.
4109/// \param __b25
4110/// An 8-bit integral value used to initialize bits [55:48] of the result.
4111/// \param __b24
4112/// An 8-bit integral value used to initialize bits [63:56] of the result.
4113/// \param __b23
4114/// An 8-bit integral value used to initialize bits [71:64] of the result.
4115/// \param __b22
4116/// An 8-bit integral value used to initialize bits [79:72] of the result.
4117/// \param __b21
4118/// An 8-bit integral value used to initialize bits [87:80] of the result.
4119/// \param __b20
4120/// An 8-bit integral value used to initialize bits [95:88] of the result.
4121/// \param __b19
4122/// An 8-bit integral value used to initialize bits [103:96] of the result.
4123/// \param __b18
4124/// An 8-bit integral value used to initialize bits [111:104] of the result.
4125/// \param __b17
4126/// An 8-bit integral value used to initialize bits [119:112] of the result.
4127/// \param __b16
4128/// An 8-bit integral value used to initialize bits [127:120] of the result.
4129/// \param __b15
4130/// An 8-bit integral value used to initialize bits [135:128] of the result.
4131/// \param __b14
4132/// An 8-bit integral value used to initialize bits [143:136] of the result.
4133/// \param __b13
4134/// An 8-bit integral value used to initialize bits [151:144] of the result.
4135/// \param __b12
4136/// An 8-bit integral value used to initialize bits [159:152] of the result.
4137/// \param __b11
4138/// An 8-bit integral value used to initialize bits [167:160] of the result.
4139/// \param __b10
4140/// An 8-bit integral value used to initialize bits [175:168] of the result.
4141/// \param __b09
4142/// An 8-bit integral value used to initialize bits [183:176] of the result.
4143/// \param __b08
4144/// An 8-bit integral value used to initialize bits [191:184] of the result.
4145/// \param __b07
4146/// An 8-bit integral value used to initialize bits [199:192] of the result.
4147/// \param __b06
4148/// An 8-bit integral value used to initialize bits [207:200] of the result.
4149/// \param __b05
4150/// An 8-bit integral value used to initialize bits [215:208] of the result.
4151/// \param __b04
4152/// An 8-bit integral value used to initialize bits [223:216] of the result.
4153/// \param __b03
4154/// An 8-bit integral value used to initialize bits [231:224] of the result.
4155/// \param __b02
4156/// An 8-bit integral value used to initialize bits [239:232] of the result.
4157/// \param __b01
4158/// An 8-bit integral value used to initialize bits [247:240] of the result.
4159/// \param __b00
4160/// An 8-bit integral value used to initialize bits [255:248] of the result.
4161/// \returns An initialized 256-bit integer vector.
4162static __inline __m256i __DEFAULT_FN_ATTRS
4163_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4164 char __b27, char __b26, char __b25, char __b24,
4165 char __b23, char __b22, char __b21, char __b20,
4166 char __b19, char __b18, char __b17, char __b16,
4167 char __b15, char __b14, char __b13, char __b12,
4168 char __b11, char __b10, char __b09, char __b08,
4169 char __b07, char __b06, char __b05, char __b04,
4170 char __b03, char __b02, char __b01, char __b00)
4171{
4172 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4173 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4174 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4175 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4176}
4177
4178/// Constructs a 256-bit integer vector, initialized in reverse order
4179/// with the specified 64-bit integral values.
4180///
4181/// \headerfile <x86intrin.h>
4182///
4183/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4184/// instruction.
4185///
4186/// \param __a
4187/// A 64-bit integral value used to initialize bits [63:0] of the result.
4188/// \param __b
4189/// A 64-bit integral value used to initialize bits [127:64] of the result.
4190/// \param __c
4191/// A 64-bit integral value used to initialize bits [191:128] of the result.
4192/// \param __d
4193/// A 64-bit integral value used to initialize bits [255:192] of the result.
4194/// \returns An initialized 256-bit integer vector.
4195static __inline __m256i __DEFAULT_FN_ATTRS
4196_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4197{
4198 return _mm256_set_epi64x(__d, __c, __b, __a);
4199}
4200
4201/* Create vectors with repeated elements */
4202/// Constructs a 256-bit floating-point vector of [4 x double], with each
4203/// of the four double-precision floating-point vector elements set to the
4204/// specified double-precision floating-point value.
4205///
4206/// \headerfile <x86intrin.h>
4207///
4208/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4209///
4210/// \param __w
4211/// A double-precision floating-point value used to initialize each vector
4212/// element of the result.
4213/// \returns An initialized 256-bit floating-point vector of [4 x double].
4214static __inline __m256d __DEFAULT_FN_ATTRS
4216{
4217 return _mm256_set_pd(__w, __w, __w, __w);
4218}
4219
4220/// Constructs a 256-bit floating-point vector of [8 x float], with each
4221/// of the eight single-precision floating-point vector elements set to the
4222/// specified single-precision floating-point value.
4223///
4224/// \headerfile <x86intrin.h>
4225///
4226/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4227/// instruction.
4228///
4229/// \param __w
4230/// A single-precision floating-point value used to initialize each vector
4231/// element of the result.
4232/// \returns An initialized 256-bit floating-point vector of [8 x float].
4233static __inline __m256 __DEFAULT_FN_ATTRS
4235{
4236 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4237}
4238
4239/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4240/// 32-bit integral vector elements set to the specified 32-bit integral
4241/// value.
4242///
4243/// \headerfile <x86intrin.h>
4244///
4245/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4246/// instruction.
4247///
4248/// \param __i
4249/// A 32-bit integral value used to initialize each vector element of the
4250/// result.
4251/// \returns An initialized 256-bit integer vector of [8 x i32].
4252static __inline __m256i __DEFAULT_FN_ATTRS
4254{
4255 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4256}
4257
4258/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4259/// 16-bit integral vector elements set to the specified 16-bit integral
4260/// value.
4261///
4262/// \headerfile <x86intrin.h>
4263///
4264/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4265///
4266/// \param __w
4267/// A 16-bit integral value used to initialize each vector element of the
4268/// result.
4269/// \returns An initialized 256-bit integer vector of [16 x i16].
4270static __inline __m256i __DEFAULT_FN_ATTRS
4272{
4273 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4274 __w, __w, __w, __w, __w, __w, __w, __w);
4275}
4276
4277/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4278/// 8-bit integral vector elements set to the specified 8-bit integral value.
4279///
4280/// \headerfile <x86intrin.h>
4281///
4282/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4283///
4284/// \param __b
4285/// An 8-bit integral value used to initialize each vector element of the
4286/// result.
4287/// \returns An initialized 256-bit integer vector of [32 x i8].
4288static __inline __m256i __DEFAULT_FN_ATTRS
4290{
4291 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4292 __b, __b, __b, __b, __b, __b, __b, __b,
4293 __b, __b, __b, __b, __b, __b, __b, __b,
4294 __b, __b, __b, __b, __b, __b, __b, __b);
4295}
4296
4297/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4298/// 64-bit integral vector elements set to the specified 64-bit integral
4299/// value.
4300///
4301/// \headerfile <x86intrin.h>
4302///
4303/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4304///
4305/// \param __q
4306/// A 64-bit integral value used to initialize each vector element of the
4307/// result.
4308/// \returns An initialized 256-bit integer vector of [4 x i64].
4309static __inline __m256i __DEFAULT_FN_ATTRS
4311{
4312 return _mm256_set_epi64x(__q, __q, __q, __q);
4313}
4314
/* Create zeroed vectors */
4316/// Constructs a 256-bit floating-point vector of [4 x double] with all
4317/// vector elements initialized to zero.
4318///
4319/// \headerfile <x86intrin.h>
4320///
4321/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4322///
4323/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4324static __inline __m256d __DEFAULT_FN_ATTRS
4326{
4327 return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
4328}
4329
4330/// Constructs a 256-bit floating-point vector of [8 x float] with all
4331/// vector elements initialized to zero.
4332///
4333/// \headerfile <x86intrin.h>
4334///
4335/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4336///
4337/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4338static __inline __m256 __DEFAULT_FN_ATTRS
4340{
4341 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4342}
4343
4344/// Constructs a 256-bit integer vector initialized to zero.
4345///
4346/// \headerfile <x86intrin.h>
4347///
4348/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4349///
4350/// \returns A 256-bit integer vector initialized to zero.
4351static __inline __m256i __DEFAULT_FN_ATTRS
4353{
4354 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4355}
4356
4357/* Cast between vector types */
4358/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4359/// floating-point vector of [8 x float].
4360///
4361/// \headerfile <x86intrin.h>
4362///
4363/// This intrinsic has no corresponding instruction.
4364///
4365/// \param __a
4366/// A 256-bit floating-point vector of [4 x double].
4367/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4368/// bitwise pattern as the parameter.
4369static __inline __m256 __DEFAULT_FN_ATTRS
4371{
4372 return (__m256)__a;
4373}
4374
4375/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4376/// integer vector.
4377///
4378/// \headerfile <x86intrin.h>
4379///
4380/// This intrinsic has no corresponding instruction.
4381///
4382/// \param __a
4383/// A 256-bit floating-point vector of [4 x double].
4384/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4385/// parameter.
4386static __inline __m256i __DEFAULT_FN_ATTRS
4388{
4389 return (__m256i)__a;
4390}
4391
4392/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4393/// floating-point vector of [4 x double].
4394///
4395/// \headerfile <x86intrin.h>
4396///
4397/// This intrinsic has no corresponding instruction.
4398///
4399/// \param __a
4400/// A 256-bit floating-point vector of [8 x float].
4401/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4402/// bitwise pattern as the parameter.
4403static __inline __m256d __DEFAULT_FN_ATTRS
4405{
4406 return (__m256d)__a;
4407}
4408
4409/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4410/// integer vector.
4411///
4412/// \headerfile <x86intrin.h>
4413///
4414/// This intrinsic has no corresponding instruction.
4415///
4416/// \param __a
4417/// A 256-bit floating-point vector of [8 x float].
4418/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4419/// parameter.
4420static __inline __m256i __DEFAULT_FN_ATTRS
4422{
4423 return (__m256i)__a;
4424}
4425
4426/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4427/// of [8 x float].
4428///
4429/// \headerfile <x86intrin.h>
4430///
4431/// This intrinsic has no corresponding instruction.
4432///
4433/// \param __a
4434/// A 256-bit integer vector.
4435/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4436/// bitwise pattern as the parameter.
4437static __inline __m256 __DEFAULT_FN_ATTRS
4439{
4440 return (__m256)__a;
4441}
4442
4443/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4444/// of [4 x double].
4445///
4446/// \headerfile <x86intrin.h>
4447///
4448/// This intrinsic has no corresponding instruction.
4449///
4450/// \param __a
4451/// A 256-bit integer vector.
4452/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4453/// bitwise pattern as the parameter.
4454static __inline __m256d __DEFAULT_FN_ATTRS
4456{
4457 return (__m256d)__a;
4458}
4459
4460/// Returns the lower 128 bits of a 256-bit floating-point vector of
4461/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4462///
4463/// \headerfile <x86intrin.h>
4464///
4465/// This intrinsic has no corresponding instruction.
4466///
4467/// \param __a
4468/// A 256-bit floating-point vector of [4 x double].
4469/// \returns A 128-bit floating-point vector of [2 x double] containing the
4470/// lower 128 bits of the parameter.
4471static __inline __m128d __DEFAULT_FN_ATTRS
4473{
4474 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4475}
4476
4477/// Returns the lower 128 bits of a 256-bit floating-point vector of
4478/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4479///
4480/// \headerfile <x86intrin.h>
4481///
4482/// This intrinsic has no corresponding instruction.
4483///
4484/// \param __a
4485/// A 256-bit floating-point vector of [8 x float].
4486/// \returns A 128-bit floating-point vector of [4 x float] containing the
4487/// lower 128 bits of the parameter.
4488static __inline __m128 __DEFAULT_FN_ATTRS
4490{
4491 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4492}
4493
4494/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4495///
4496/// \headerfile <x86intrin.h>
4497///
4498/// This intrinsic has no corresponding instruction.
4499///
4500/// \param __a
4501/// A 256-bit integer vector.
4502/// \returns A 128-bit integer vector containing the lower 128 bits of the
4503/// parameter.
4504static __inline __m128i __DEFAULT_FN_ATTRS
4506{
4507 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4508}
4509
4510/// Constructs a 256-bit floating-point vector of [4 x double] from a
4511/// 128-bit floating-point vector of [2 x double].
4512///
4513/// The lower 128 bits contain the value of the source vector. The contents
4514/// of the upper 128 bits are undefined.
4515///
4516/// \headerfile <x86intrin.h>
4517///
4518/// This intrinsic has no corresponding instruction.
4519///
4520/// \param __a
4521/// A 128-bit vector of [2 x double].
4522/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4523/// contain the value of the parameter. The contents of the upper 128 bits
4524/// are undefined.
4525static __inline __m256d __DEFAULT_FN_ATTRS
4527{
4528 return __builtin_shufflevector(
4529 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4530}
4531
4532/// Constructs a 256-bit floating-point vector of [8 x float] from a
4533/// 128-bit floating-point vector of [4 x float].
4534///
4535/// The lower 128 bits contain the value of the source vector. The contents
4536/// of the upper 128 bits are undefined.
4537///
4538/// \headerfile <x86intrin.h>
4539///
4540/// This intrinsic has no corresponding instruction.
4541///
4542/// \param __a
4543/// A 128-bit vector of [4 x float].
4544/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4545/// contain the value of the parameter. The contents of the upper 128 bits
4546/// are undefined.
4547static __inline __m256 __DEFAULT_FN_ATTRS
4549{
4550 return __builtin_shufflevector((__v4sf)__a,
4551 (__v4sf)__builtin_nondeterministic_value(__a),
4552 0, 1, 2, 3, 4, 5, 6, 7);
4553}
4554
4555/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4556///
4557/// The lower 128 bits contain the value of the source vector. The contents
4558/// of the upper 128 bits are undefined.
4559///
4560/// \headerfile <x86intrin.h>
4561///
4562/// This intrinsic has no corresponding instruction.
4563///
4564/// \param __a
4565/// A 128-bit integer vector.
4566/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4567/// the parameter. The contents of the upper 128 bits are undefined.
4568static __inline __m256i __DEFAULT_FN_ATTRS
4570{
4571 return __builtin_shufflevector(
4572 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4573}
4574
4575/// Constructs a 256-bit floating-point vector of [4 x double] from a
4576/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4577/// contain the value of the source vector. The upper 128 bits are set
4578/// to zero.
4579///
4580/// \headerfile <x86intrin.h>
4581///
4582/// This intrinsic has no corresponding instruction.
4583///
4584/// \param __a
4585/// A 128-bit vector of [2 x double].
4586/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4587/// contain the value of the parameter. The upper 128 bits are set to zero.
4588static __inline __m256d __DEFAULT_FN_ATTRS
4590{
4591 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4592}
4593
4594/// Constructs a 256-bit floating-point vector of [8 x float] from a
4595/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4596/// the value of the source vector. The upper 128 bits are set to zero.
4597///
4598/// \headerfile <x86intrin.h>
4599///
4600/// This intrinsic has no corresponding instruction.
4601///
4602/// \param __a
4603/// A 128-bit vector of [4 x float].
4604/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4605/// contain the value of the parameter. The upper 128 bits are set to zero.
4606static __inline __m256 __DEFAULT_FN_ATTRS
4608{
4609 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4610}
4611
4612/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4613/// The lower 128 bits contain the value of the source vector. The upper
4614/// 128 bits are set to zero.
4615///
4616/// \headerfile <x86intrin.h>
4617///
4618/// This intrinsic has no corresponding instruction.
4619///
4620/// \param __a
4621/// A 128-bit integer vector.
4622/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4623/// the parameter. The upper 128 bits are set to zero.
4624static __inline __m256i __DEFAULT_FN_ATTRS
4626{
4627 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4628}
4629
4630/*
4631 Vector insert.
4632 We use macros rather than inlines because we only want to accept
4633 invocations where the immediate M is a constant expression.
4634*/
/// Produces a 256-bit vector of [8 x float] that is a copy of the 256-bit
///    vector of [8 x float] given as the first parameter, except that either
///    its upper or its lower 128 bits are replaced with the 128-bit vector of
///    [4 x float] given as the second parameter.
///
///    The immediate integer parameter selects which 128-bit half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It is copied to the result first; one
///    128-bit half of that copy is then overwritten with the contents of
///    \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. Its contents are written to either the
///    upper or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M)                                        \
  ((__m256)__builtin_ia32_vinsertf128_ps256(                                   \
      (__v8sf)(__m256)(V1),                                                    \
      (__v4sf)(__m128)(V2),                                                    \
      (int)(M)))
4672
/// Produces a 256-bit vector of [4 x double] that is a copy of the 256-bit
///    vector of [4 x double] given as the first parameter, except that either
///    its upper or its lower 128 bits are replaced with the 128-bit vector of
///    [2 x double] given as the second parameter.
///
///    The immediate integer parameter selects which 128-bit half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It is copied to the result first; one
///    128-bit half of that copy is then overwritten with the contents of
///    \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. Its contents are written to either the
///    upper or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M)                                        \
  ((__m256d)__builtin_ia32_vinsertf128_pd256(                                  \
      (__v4df)(__m256d)(V1),                                                   \
      (__v2df)(__m128d)(V2),                                                   \
      (int)(M)))
4710
/// Produces a 256-bit integer vector that is a copy of the 256-bit integer
///    vector given as the first parameter, except that either its upper or
///    its lower 128 bits are replaced with the 128-bit integer vector given
///    as the second parameter.
///
///    The immediate integer parameter selects which 128-bit half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It is copied to the result first; one 128-bit
///    half of that copy is then overwritten with the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. Its contents are written to either the upper
///    or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M)                                     \
  ((__m256i)__builtin_ia32_vinsertf128_si256(                                  \
      (__v8si)(__m256i)(V1),                                                   \
      (__v4si)(__m128i)(V2),                                                   \
      (int)(M)))
4748
4749/*
4750 Vector extract.
4751 We use macros rather than inlines because we only want to accept
4752 invocations where the immediate M is a constant expression.
4753*/
/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a 128-bit
///    vector of [4 x float]. The immediate integer parameter selects whether
///    the upper or the lower half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M)                                            \
  ((__m128)__builtin_ia32_vextractf128_ps256(                                  \
      (__v8sf)(__m256)(V), (int)(M)))
4777
/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a 128-bit
///    vector of [2 x double]. The immediate integer parameter selects whether
///    the upper or the lower half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M)                                            \
  ((__m128d)__builtin_ia32_vextractf128_pd256(                                 \
      (__v4df)(__m256d)(V), (int)(M)))
4801
/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit integer
///    vector. The immediate integer parameter selects whether the upper or
///    the lower half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M)                                         \
  ((__m128i)__builtin_ia32_vextractf128_si256(                                 \
      (__v8si)(__m256i)(V), (int)(M)))
4825
4826/// Constructs a 256-bit floating-point vector of [8 x float] by
4827/// concatenating two 128-bit floating-point vectors of [4 x float].
4828///
4829/// \headerfile <x86intrin.h>
4830///
4831/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4832///
4833/// \param __hi
4834/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4835/// 128 bits of the result.
4836/// \param __lo
4837/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4838/// 128 bits of the result.
4839/// \returns A 256-bit floating-point vector of [8 x float] containing the
4840/// concatenated result.
4841static __inline __m256 __DEFAULT_FN_ATTRS
4842_mm256_set_m128 (__m128 __hi, __m128 __lo)
4843{
4844 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4845}
4846
4847/// Constructs a 256-bit floating-point vector of [4 x double] by
4848/// concatenating two 128-bit floating-point vectors of [2 x double].
4849///
4850/// \headerfile <x86intrin.h>
4851///
4852/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4853///
4854/// \param __hi
4855/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4856/// 128 bits of the result.
4857/// \param __lo
4858/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4859/// 128 bits of the result.
4860/// \returns A 256-bit floating-point vector of [4 x double] containing the
4861/// concatenated result.
4862static __inline __m256d __DEFAULT_FN_ATTRS
4863_mm256_set_m128d (__m128d __hi, __m128d __lo)
4864{
4865 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4866}
4867
4868/// Constructs a 256-bit integer vector by concatenating two 128-bit
4869/// integer vectors.
4870///
4871/// \headerfile <x86intrin.h>
4872///
4873/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4874///
4875/// \param __hi
4876/// A 128-bit integer vector to be copied to the upper 128 bits of the
4877/// result.
4878/// \param __lo
4879/// A 128-bit integer vector to be copied to the lower 128 bits of the
4880/// result.
4881/// \returns A 256-bit integer vector containing the concatenated result.
4882static __inline __m256i __DEFAULT_FN_ATTRS
4883_mm256_set_m128i (__m128i __hi, __m128i __lo)
4884{
4885 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4886}
4887
4888/// Constructs a 256-bit floating-point vector of [8 x float] by
4889/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4890/// similar to _mm256_set_m128, but the order of the input parameters is
4891/// swapped.
4892///
4893/// \headerfile <x86intrin.h>
4894///
4895/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4896///
4897/// \param __lo
4898/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4899/// 128 bits of the result.
4900/// \param __hi
4901/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4902/// 128 bits of the result.
4903/// \returns A 256-bit floating-point vector of [8 x float] containing the
4904/// concatenated result.
4905static __inline __m256 __DEFAULT_FN_ATTRS
4906_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4907{
4908 return _mm256_set_m128(__hi, __lo);
4909}
4910
4911/// Constructs a 256-bit floating-point vector of [4 x double] by
4912/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4913/// similar to _mm256_set_m128d, but the order of the input parameters is
4914/// swapped.
4915///
4916/// \headerfile <x86intrin.h>
4917///
4918/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4919///
4920/// \param __lo
4921/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4922/// 128 bits of the result.
4923/// \param __hi
4924/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4925/// 128 bits of the result.
4926/// \returns A 256-bit floating-point vector of [4 x double] containing the
4927/// concatenated result.
4928static __inline __m256d __DEFAULT_FN_ATTRS
4929_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4930{
4931 return (__m256d)_mm256_set_m128d(__hi, __lo);
4932}
4933
4934/// Constructs a 256-bit integer vector by concatenating two 128-bit
4935/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4936/// the input parameters is swapped.
4937///
4938/// \headerfile <x86intrin.h>
4939///
4940/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4941///
4942/// \param __lo
4943/// A 128-bit integer vector to be copied to the lower 128 bits of the
4944/// result.
4945/// \param __hi
4946/// A 128-bit integer vector to be copied to the upper 128 bits of the
4947/// result.
4948/// \returns A 256-bit integer vector containing the concatenated result.
4949static __inline __m256i __DEFAULT_FN_ATTRS
4950_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4951{
4952 return (__m256i)_mm256_set_m128i(__hi, __lo);
4953}
4954
4955/* SIMD load ops (unaligned) */
4956/// Loads two 128-bit floating-point vectors of [4 x float] from
4957/// unaligned memory locations and constructs a 256-bit floating-point vector
4958/// of [8 x float] by concatenating the two 128-bit vectors.
4959///
4960/// \headerfile <x86intrin.h>
4961///
4962/// This intrinsic corresponds to load instructions followed by the
4963/// <c> VINSERTF128 </c> instruction.
4964///
4965/// \param __addr_hi
4966/// A pointer to a 128-bit memory location containing 4 consecutive
4967/// single-precision floating-point values. These values are to be copied to
4968/// bits[255:128] of the result. The address of the memory location does not
4969/// have to be aligned.
4970/// \param __addr_lo
4971/// A pointer to a 128-bit memory location containing 4 consecutive
4972/// single-precision floating-point values. These values are to be copied to
4973/// bits[127:0] of the result. The address of the memory location does not
4974/// have to be aligned.
4975/// \returns A 256-bit floating-point vector of [8 x float] containing the
4976/// concatenated result.
4977static __inline __m256 __DEFAULT_FN_ATTRS
4978_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4979{
4980 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4981}
4982
4983/// Loads two 128-bit floating-point vectors of [2 x double] from
4984/// unaligned memory locations and constructs a 256-bit floating-point vector
4985/// of [4 x double] by concatenating the two 128-bit vectors.
4986///
4987/// \headerfile <x86intrin.h>
4988///
4989/// This intrinsic corresponds to load instructions followed by the
4990/// <c> VINSERTF128 </c> instruction.
4991///
4992/// \param __addr_hi
4993/// A pointer to a 128-bit memory location containing two consecutive
4994/// double-precision floating-point values. These values are to be copied to
4995/// bits[255:128] of the result. The address of the memory location does not
4996/// have to be aligned.
4997/// \param __addr_lo
4998/// A pointer to a 128-bit memory location containing two consecutive
4999/// double-precision floating-point values. These values are to be copied to
5000/// bits[127:0] of the result. The address of the memory location does not
5001/// have to be aligned.
5002/// \returns A 256-bit floating-point vector of [4 x double] containing the
5003/// concatenated result.
5004static __inline __m256d __DEFAULT_FN_ATTRS
5005_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
5006{
5007 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
5008}
5009
5010/// Loads two 128-bit integer vectors from unaligned memory locations and
5011/// constructs a 256-bit integer vector by concatenating the two 128-bit
5012/// vectors.
5013///
5014/// \headerfile <x86intrin.h>
5015///
5016/// This intrinsic corresponds to load instructions followed by the
5017/// <c> VINSERTF128 </c> instruction.
5018///
5019/// \param __addr_hi
5020/// A pointer to a 128-bit memory location containing a 128-bit integer
5021/// vector. This vector is to be copied to bits[255:128] of the result. The
5022/// address of the memory location does not have to be aligned.
5023/// \param __addr_lo
5024/// A pointer to a 128-bit memory location containing a 128-bit integer
5025/// vector. This vector is to be copied to bits[127:0] of the result. The
5026/// address of the memory location does not have to be aligned.
5027/// \returns A 256-bit integer vector containing the concatenated result.
5028static __inline __m256i __DEFAULT_FN_ATTRS
5029_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
5030{
5031 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5032}
5033
5034/* SIMD store ops (unaligned) */
5035/// Stores the upper and lower 128 bits of a 256-bit floating-point
5036/// vector of [8 x float] into two different unaligned memory locations.
5037///
5038/// \headerfile <x86intrin.h>
5039///
5040/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5041/// store instructions.
5042///
5043/// \param __addr_hi
5044/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5045/// copied to this memory location. The address of this memory location does
5046/// not have to be aligned.
5047/// \param __addr_lo
5048/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5049/// copied to this memory location. The address of this memory location does
5050/// not have to be aligned.
5051/// \param __a
5052/// A 256-bit floating-point vector of [8 x float].
5053static __inline void __DEFAULT_FN_ATTRS
5054_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5055{
5056 __m128 __v128;
5057
5058 __v128 = _mm256_castps256_ps128(__a);
5059 _mm_storeu_ps(__addr_lo, __v128);
5060 __v128 = _mm256_extractf128_ps(__a, 1);
5061 _mm_storeu_ps(__addr_hi, __v128);
5062}
5063
5064/// Stores the upper and lower 128 bits of a 256-bit floating-point
5065/// vector of [4 x double] into two different unaligned memory locations.
5066///
5067/// \headerfile <x86intrin.h>
5068///
5069/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5070/// store instructions.
5071///
5072/// \param __addr_hi
5073/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5074/// copied to this memory location. The address of this memory location does
5075/// not have to be aligned.
5076/// \param __addr_lo
5077/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5078/// copied to this memory location. The address of this memory location does
5079/// not have to be aligned.
5080/// \param __a
5081/// A 256-bit floating-point vector of [4 x double].
5082static __inline void __DEFAULT_FN_ATTRS
5083_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5084{
5085 __m128d __v128;
5086
5087 __v128 = _mm256_castpd256_pd128(__a);
5088 _mm_storeu_pd(__addr_lo, __v128);
5089 __v128 = _mm256_extractf128_pd(__a, 1);
5090 _mm_storeu_pd(__addr_hi, __v128);
5091}
5092
5093/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5094/// two different unaligned memory locations.
5095///
5096/// \headerfile <x86intrin.h>
5097///
5098/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5099/// store instructions.
5100///
5101/// \param __addr_hi
5102/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5103/// copied to this memory location. The address of this memory location does
5104/// not have to be aligned.
5105/// \param __addr_lo
5106/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5107/// copied to this memory location. The address of this memory location does
5108/// not have to be aligned.
5109/// \param __a
5110/// A 256-bit integer vector.
5111static __inline void __DEFAULT_FN_ATTRS
5112_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5113{
5114 __m128i __v128;
5115
5116 __v128 = _mm256_castsi256_si128(__a);
5117 _mm_storeu_si128(__addr_lo, __v128);
5118 __v128 = _mm256_extractf128_si256(__a, 1);
5119 _mm_storeu_si128(__addr_hi, __v128);
5120}
5121
/* Undefine the function-attribute helper macros used by the definitions
   above, so they are no longer defined past this point. */
5122#undef __DEFAULT_FN_ATTRS
5123#undef __DEFAULT_FN_ATTRS128
5124
5125#endif /* __AVXINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:80
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3057
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point vectors of [4 x float].
Definition: avxintrin.h:4842
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition: avxintrin.h:3101
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:744
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition: avxintrin.h:3307
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2929
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition: avxintrin.h:92
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition: avxintrin.h:3603
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition: avxintrin.h:4438
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition: avxintrin.h:3121
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: avxintrin.h:4163
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition: avxintrin.h:2283
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition: avxintrin.h:3271
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4625
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition: avxintrin.h:3327
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition: avxintrin.h:2470
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and constructs a 256-bit floating-point vector of [8 x float] by concatenating the two 128-bit vectors.
Definition: avxintrin.h:4978
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:356
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3417
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition: avxintrin.h:4234
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition: avxintrin.h:4421
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:656
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition: avxintrin.h:2191
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:390
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition: avxintrin.h:2403
static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-p...
Definition: avxintrin.h:4472
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition: avxintrin.h:2975
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition: avxintrin.h:2207
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zer...
Definition: avxintrin.h:4339
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition: avxintrin.h:3652
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2877
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
Definition: avxintrin.h:4489
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3442
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3393
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set...
Definition: avxintrin.h:4289
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition: avxintrin.h:981
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition: avxintrin.h:2242
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition: avxintrin.h:3365
#define _mm256_extractf128_ps(V, M)
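Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float], as determined by the immediate integer parameter, and returns the extracted bits as a 128-bit vector of [4 x float].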
Definition: avxintrin.h:4775
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the immediate integer parameter, and returns the extracted bits as a 128-bit integer vector.
Definition: avxintrin.h:4823
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition: avxintrin.h:3214
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2956
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition: avxintrin.h:1405
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-point vectors of [2 x double].
Definition: avxintrin.h:4863
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition: avxintrin.h:4370
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition: avxintrin.h:4215
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition: avxintrin.h:3583
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition: avxintrin.h:890
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition: avxintrin.h:3639
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:373
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: avxintrin.h:4078
#define __DEFAULT_FN_ATTRS
Definition: avxintrin.h:53
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition: avxintrin.h:3515
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4526
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition: avxintrin.h:2263
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition: avxintrin.h:3665
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition: avxintrin.h:3998
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition: avxintrin.h:2226
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements se...
Definition: avxintrin.h:4310
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4455
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2613
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition: avxintrin.h:286
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition: avxintrin.h:3692
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2847
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition: avxintrin.h:3157
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:674
#define _mm256_extractf128_pd(V, M)
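Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double], as determined by the immediate integer parameter, and returns the extracted bits as a 128-bit vector of [2 x double].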
Definition: avxintrin.h:4799
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers re...
Definition: avxintrin.h:2303
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition: avxintrin.h:4548
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4950
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition: avxintrin.h:128
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition: avxintrin.h:3929
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating-point value from the specified address pointed to by __a and broadcasts it to the elements of a [8 x float] vector.
Definition: avxintrin.h:3079
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition: avxintrin.h:244
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2789
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and constructs a 256-bit floating-point vector of [4 x double] by concatenating the two 128-bit vectors.
Definition: avxintrin.h:5005
static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition: avxintrin.h:2352
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:2378
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition: avxintrin.h:184
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2701
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two different unaligned memory locations.
Definition: avxintrin.h:5083
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition: avxintrin.h:339
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:147
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:721
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:166
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2730
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition: avxintrin.h:304
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:560
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition: avxintrin.h:3539
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition: avxintrin.h:3491
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:638
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating-point values from an unaligned memory location pointed to by __p into a vector of [4 x double].
Definition: avxintrin.h:3174
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2903
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2 x double].
Definition: avxintrin.h:4589
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral values.
Definition: avxintrin.h:4196
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4404
static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition: avxintrin.h:2319
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:698
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:767
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2759
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition: avxintrin.h:2176
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-point vectors of [2 x double].
Definition: avxintrin.h:4929
#define __DEFAULT_FN_ATTRS128
Definition: avxintrin.h:56
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition: avxintrin.h:581
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition: avxintrin.h:4030
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition: avxintrin.h:1433
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition: avxintrin.h:2448
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory locations.
Definition: avxintrin.h:5112
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition: avxintrin.h:110
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4883
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:620
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2554
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition: avxintrin.h:4325
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition: avxintrin.h:2425
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition: avxintrin.h:3811
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer vector by concatenating the two 128-bit vectors.
Definition: avxintrin.h:5029
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition: avxintrin.h:797
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition: avxintrin.h:4352
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition: avxintrin.h:4387
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3466
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition: avxintrin.h:202
static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition: avxintrin.h:2335
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating-point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location.
Definition: avxintrin.h:3624
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point vectors of [4 x float].
Definition: avxintrin.h:4906
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:3230
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3348
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition: avxintrin.h:265
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition: avxintrin.h:4505
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition: avxintrin.h:3763
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4569
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2497
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition: avxintrin.h:3894
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating-point values from an unaligned memory location pointed to by __p into a vector of [8 x float].
Definition: avxintrin.h:3194
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-precision floating-point values.
Definition: avxintrin.h:3731
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating-point value from the specified address pointed to by __a and broadcasts it to the elements of a [4 x float] vector.
Definition: avxintrin.h:3035
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2671
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition: avxintrin.h:2993
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition: avxintrin.h:322
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2524
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition: avxintrin.h:836
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition: avxintrin.h:223
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2642
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition: avxintrin.h:4271
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:3251
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:542
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating-point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3289
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two different unaligned memory locations.
Definition: avxintrin.h:5054
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition: avxintrin.h:4253
typedef double __v4df __attribute__((__vector_size__(32)))
Definition: avxintrin.h:17
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2583
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2818
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 x float].
Definition: avxintrin.h:4607
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition: avxintrin.h:74
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition: avxintrin.h:3563
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating-point values from a 32-byte aligned memory location pointed to by __p into a vector of [4 x double].
Definition: avxintrin.h:3141
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition: avxintrin.h:602
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the specified double-precision floating-point values.
Definition: avxintrin.h:3958
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1609
static __inline__ void int __a
Definition: emmintrin.h:4057
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit integer vector.
Definition: emmintrin.h:3440
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1857
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1970
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3858
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3889
struct __storeu_i16 *__P __v
Definition: immintrin.h:480
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:2018
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:2098
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1855