/* Source listing of avxintrin.h, clang 22.0.0git. */
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* 256-bit element-typed vectors; the intrinsics in this file cast their
 * operands to these types. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types, aligned to 32 bytes. */
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Same layouts with 1-byte alignment. */
typedef float __m256_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));

42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Define the default attributes for the functions in this file.
 * __DEFAULT_FN_ATTRS is used by the 256-bit intrinsics and
 * __DEFAULT_FN_ATTRS128 by the 128-bit ones; they differ only in the
 * __min_vector_width__ value. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(128)))

/* The *_CONSTEXPR variants append `constexpr` when compiled as C++11 or
 * later; otherwise they expand to the plain attribute sets. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

68/* Arithmetic */
69/// Adds two 256-bit vectors of [4 x double].
70///
71/// \headerfile <x86intrin.h>
72///
73/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
74///
75/// \param __a
76/// A 256-bit vector of [4 x double] containing one of the source operands.
77/// \param __b
78/// A 256-bit vector of [4 x double] containing one of the source operands.
79/// \returns A 256-bit vector of [4 x double] containing the sums of both
80/// operands.
81static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
82_mm256_add_pd(__m256d __a, __m256d __b) {
83 return (__m256d)((__v4df)__a+(__v4df)__b);
84}
85
86/// Adds two 256-bit vectors of [8 x float].
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
91///
92/// \param __a
93/// A 256-bit vector of [8 x float] containing one of the source operands.
94/// \param __b
95/// A 256-bit vector of [8 x float] containing one of the source operands.
96/// \returns A 256-bit vector of [8 x float] containing the sums of both
97/// operands.
98static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a,
99 __m256 __b) {
100 return (__m256)((__v8sf)__a+(__v8sf)__b);
101}
102
103/// Subtracts two 256-bit vectors of [4 x double].
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
108///
109/// \param __a
110/// A 256-bit vector of [4 x double] containing the minuend.
111/// \param __b
112/// A 256-bit vector of [4 x double] containing the subtrahend.
113/// \returns A 256-bit vector of [4 x double] containing the differences between
114/// both operands.
115static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
116_mm256_sub_pd(__m256d __a, __m256d __b) {
117 return (__m256d)((__v4df)__a-(__v4df)__b);
118}
119
120/// Subtracts two 256-bit vectors of [8 x float].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
125///
126/// \param __a
127/// A 256-bit vector of [8 x float] containing the minuend.
128/// \param __b
129/// A 256-bit vector of [8 x float] containing the subtrahend.
130/// \returns A 256-bit vector of [8 x float] containing the differences between
131/// both operands.
132static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a,
133 __m256 __b) {
134 return (__m256)((__v8sf)__a-(__v8sf)__b);
135}
136
137/// Adds the even-indexed values and subtracts the odd-indexed values of
138/// two 256-bit vectors of [4 x double].
139///
140/// \headerfile <x86intrin.h>
141///
142/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
143///
144/// \param __a
145/// A 256-bit vector of [4 x double] containing the left source operand.
146/// \param __b
147/// A 256-bit vector of [4 x double] containing the right source operand.
148/// \returns A 256-bit vector of [4 x double] containing the alternating sums
149/// and differences between both operands.
150static __inline __m256d __DEFAULT_FN_ATTRS
151_mm256_addsub_pd(__m256d __a, __m256d __b)
152{
153 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
154}
155
156/// Adds the even-indexed values and subtracts the odd-indexed values of
157/// two 256-bit vectors of [8 x float].
158///
159/// \headerfile <x86intrin.h>
160///
161/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
162///
163/// \param __a
164/// A 256-bit vector of [8 x float] containing the left source operand.
165/// \param __b
166/// A 256-bit vector of [8 x float] containing the right source operand.
167/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
168/// differences between both operands.
169static __inline __m256 __DEFAULT_FN_ATTRS
170_mm256_addsub_ps(__m256 __a, __m256 __b)
171{
172 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
173}
174
175/// Divides two 256-bit vectors of [4 x double].
176///
177/// \headerfile <x86intrin.h>
178///
179/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
180///
181/// \param __a
182/// A 256-bit vector of [4 x double] containing the dividend.
183/// \param __b
184/// A 256-bit vector of [4 x double] containing the divisor.
185/// \returns A 256-bit vector of [4 x double] containing the quotients of both
186/// operands.
187static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
188_mm256_div_pd(__m256d __a, __m256d __b) {
189 return (__m256d)((__v4df)__a/(__v4df)__b);
190}
191
192/// Divides two 256-bit vectors of [8 x float].
193///
194/// \headerfile <x86intrin.h>
195///
196/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
197///
198/// \param __a
199/// A 256-bit vector of [8 x float] containing the dividend.
200/// \param __b
201/// A 256-bit vector of [8 x float] containing the divisor.
202/// \returns A 256-bit vector of [8 x float] containing the quotients of both
203/// operands.
204static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a,
205 __m256 __b) {
206 return (__m256)((__v8sf)__a/(__v8sf)__b);
207}
208
209/// Compares two 256-bit vectors of [4 x double] and returns the greater
210/// of each pair of values.
211///
212/// If either value in a comparison is NaN, returns the value from \a __b.
213///
214/// \headerfile <x86intrin.h>
215///
216/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
217///
218/// \param __a
219/// A 256-bit vector of [4 x double] containing one of the operands.
220/// \param __b
221/// A 256-bit vector of [4 x double] containing one of the operands.
222/// \returns A 256-bit vector of [4 x double] containing the maximum values
223/// between both operands.
224static __inline __m256d __DEFAULT_FN_ATTRS
225_mm256_max_pd(__m256d __a, __m256d __b)
226{
227 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
228}
229
230/// Compares two 256-bit vectors of [8 x float] and returns the greater
231/// of each pair of values.
232///
233/// If either value in a comparison is NaN, returns the value from \a __b.
234///
235/// \headerfile <x86intrin.h>
236///
237/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
238///
239/// \param __a
240/// A 256-bit vector of [8 x float] containing one of the operands.
241/// \param __b
242/// A 256-bit vector of [8 x float] containing one of the operands.
243/// \returns A 256-bit vector of [8 x float] containing the maximum values
244/// between both operands.
245static __inline __m256 __DEFAULT_FN_ATTRS
246_mm256_max_ps(__m256 __a, __m256 __b)
247{
248 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
249}
250
251/// Compares two 256-bit vectors of [4 x double] and returns the lesser
252/// of each pair of values.
253///
254/// If either value in a comparison is NaN, returns the value from \a __b.
255///
256/// \headerfile <x86intrin.h>
257///
258/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
259///
260/// \param __a
261/// A 256-bit vector of [4 x double] containing one of the operands.
262/// \param __b
263/// A 256-bit vector of [4 x double] containing one of the operands.
264/// \returns A 256-bit vector of [4 x double] containing the minimum values
265/// between both operands.
266static __inline __m256d __DEFAULT_FN_ATTRS
267_mm256_min_pd(__m256d __a, __m256d __b)
268{
269 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
270}
271
272/// Compares two 256-bit vectors of [8 x float] and returns the lesser
273/// of each pair of values.
274///
275/// If either value in a comparison is NaN, returns the value from \a __b.
276///
277/// \headerfile <x86intrin.h>
278///
279/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
280///
281/// \param __a
282/// A 256-bit vector of [8 x float] containing one of the operands.
283/// \param __b
284/// A 256-bit vector of [8 x float] containing one of the operands.
285/// \returns A 256-bit vector of [8 x float] containing the minimum values
286/// between both operands.
287static __inline __m256 __DEFAULT_FN_ATTRS
288_mm256_min_ps(__m256 __a, __m256 __b)
289{
290 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
291}
292
293/// Multiplies two 256-bit vectors of [4 x double].
294///
295/// \headerfile <x86intrin.h>
296///
297/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
298///
299/// \param __a
300/// A 256-bit vector of [4 x double] containing one of the operands.
301/// \param __b
302/// A 256-bit vector of [4 x double] containing one of the operands.
303/// \returns A 256-bit vector of [4 x double] containing the products of both
304/// operands.
305static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
306_mm256_mul_pd(__m256d __a, __m256d __b) {
307 return (__m256d)((__v4df)__a * (__v4df)__b);
308}
309
310/// Multiplies two 256-bit vectors of [8 x float].
311///
312/// \headerfile <x86intrin.h>
313///
314/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
315///
316/// \param __a
317/// A 256-bit vector of [8 x float] containing one of the operands.
318/// \param __b
319/// A 256-bit vector of [8 x float] containing one of the operands.
320/// \returns A 256-bit vector of [8 x float] containing the products of both
321/// operands.
322static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
323 __m256 __b) {
324 return (__m256)((__v8sf)__a * (__v8sf)__b);
325}
326
327/// Calculates the square roots of the values in a 256-bit vector of
328/// [4 x double].
329///
330/// \headerfile <x86intrin.h>
331///
332/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
333///
334/// \param __a
335/// A 256-bit vector of [4 x double].
336/// \returns A 256-bit vector of [4 x double] containing the square roots of the
337/// values in the operand.
338static __inline __m256d __DEFAULT_FN_ATTRS
340{
341 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
342}
343
344/// Calculates the square roots of the values in a 256-bit vector of
345/// [8 x float].
346///
347/// \headerfile <x86intrin.h>
348///
349/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
350///
351/// \param __a
352/// A 256-bit vector of [8 x float].
353/// \returns A 256-bit vector of [8 x float] containing the square roots of the
354/// values in the operand.
355static __inline __m256 __DEFAULT_FN_ATTRS
357{
358 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
359}
360
361/// Calculates the reciprocal square roots of the values in a 256-bit
362/// vector of [8 x float].
363///
364/// \headerfile <x86intrin.h>
365///
366/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
367///
368/// \param __a
369/// A 256-bit vector of [8 x float].
370/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
371/// roots of the values in the operand.
372static __inline __m256 __DEFAULT_FN_ATTRS
374{
375 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
376}
377
378/// Calculates the reciprocals of the values in a 256-bit vector of
379/// [8 x float].
380///
381/// \headerfile <x86intrin.h>
382///
383/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
384///
385/// \param __a
386/// A 256-bit vector of [8 x float].
387/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
388/// values in the operand.
389static __inline __m256 __DEFAULT_FN_ATTRS
391{
392 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
393}
394
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
///    Expands to _mm256_round_pd() with the _MM_FROUND_CEIL mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)

/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
///    Expands to _mm256_round_pd() with the _MM_FROUND_FLOOR mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
///    Expands to _mm256_round_ps() with the _MM_FROUND_CEIL mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)

/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
///    Expands to _mm256_round_ps() with the _MM_FROUND_FLOOR mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

528/* Logical */
529/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
530///
531/// \headerfile <x86intrin.h>
532///
533/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
534///
535/// \param __a
536/// A 256-bit vector of [4 x double] containing one of the source operands.
537/// \param __b
538/// A 256-bit vector of [4 x double] containing one of the source operands.
539/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
540/// values between both operands.
541static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
542_mm256_and_pd(__m256d __a, __m256d __b)
543{
544 return (__m256d)((__v4du)__a & (__v4du)__b);
545}
546
547/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
548///
549/// \headerfile <x86intrin.h>
550///
551/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
552///
553/// \param __a
554/// A 256-bit vector of [8 x float] containing one of the source operands.
555/// \param __b
556/// A 256-bit vector of [8 x float] containing one of the source operands.
557/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
558/// values between both operands.
559static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
560_mm256_and_ps(__m256 __a, __m256 __b)
561{
562 return (__m256)((__v8su)__a & (__v8su)__b);
563}
564
565/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
566/// the one's complement of the values contained in the first source operand.
567///
568/// \headerfile <x86intrin.h>
569///
570/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
571///
572/// \param __a
573/// A 256-bit vector of [4 x double] containing the left source operand. The
574/// one's complement of this value is used in the bitwise AND.
575/// \param __b
576/// A 256-bit vector of [4 x double] containing the right source operand.
577/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
578/// values of the second operand and the one's complement of the first
579/// operand.
580static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
581_mm256_andnot_pd(__m256d __a, __m256d __b)
582{
583 return (__m256d)(~(__v4du)__a & (__v4du)__b);
584}
585
586/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
587/// the one's complement of the values contained in the first source operand.
588///
589/// \headerfile <x86intrin.h>
590///
591/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
592///
593/// \param __a
594/// A 256-bit vector of [8 x float] containing the left source operand. The
595/// one's complement of this value is used in the bitwise AND.
596/// \param __b
597/// A 256-bit vector of [8 x float] containing the right source operand.
598/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
599/// values of the second operand and the one's complement of the first
600/// operand.
601static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
602_mm256_andnot_ps(__m256 __a, __m256 __b)
603{
604 return (__m256)(~(__v8su)__a & (__v8su)__b);
605}
606
607/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
608///
609/// \headerfile <x86intrin.h>
610///
611/// This intrinsic corresponds to the <c> VORPD </c> instruction.
612///
613/// \param __a
614/// A 256-bit vector of [4 x double] containing one of the source operands.
615/// \param __b
616/// A 256-bit vector of [4 x double] containing one of the source operands.
617/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
618/// values between both operands.
619static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
620_mm256_or_pd(__m256d __a, __m256d __b)
621{
622 return (__m256d)((__v4du)__a | (__v4du)__b);
623}
624
625/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
626///
627/// \headerfile <x86intrin.h>
628///
629/// This intrinsic corresponds to the <c> VORPS </c> instruction.
630///
631/// \param __a
632/// A 256-bit vector of [8 x float] containing one of the source operands.
633/// \param __b
634/// A 256-bit vector of [8 x float] containing one of the source operands.
635/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
636/// values between both operands.
637static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
638_mm256_or_ps(__m256 __a, __m256 __b)
639{
640 return (__m256)((__v8su)__a | (__v8su)__b);
641}
642
643/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
644///
645/// \headerfile <x86intrin.h>
646///
647/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
648///
649/// \param __a
650/// A 256-bit vector of [4 x double] containing one of the source operands.
651/// \param __b
652/// A 256-bit vector of [4 x double] containing one of the source operands.
653/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
654/// values between both operands.
655static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
656_mm256_xor_pd(__m256d __a, __m256d __b)
657{
658 return (__m256d)((__v4du)__a ^ (__v4du)__b);
659}
660
661/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
662///
663/// \headerfile <x86intrin.h>
664///
665/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
666///
667/// \param __a
668/// A 256-bit vector of [8 x float] containing one of the source operands.
669/// \param __b
670/// A 256-bit vector of [8 x float] containing one of the source operands.
671/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
672/// values between both operands.
673static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
674_mm256_xor_ps(__m256 __a, __m256 __b)
675{
676 return (__m256)((__v8su)__a ^ (__v8su)__b);
677}
678
679/* Horizontal arithmetic */
680/// Horizontally adds the adjacent pairs of values contained in two
681/// 256-bit vectors of [4 x double].
682///
683/// \headerfile <x86intrin.h>
684///
685/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
686///
687/// \param __a
688/// A 256-bit vector of [4 x double] containing one of the source operands.
689/// The horizontal sums of the values are returned in the even-indexed
690/// elements of a vector of [4 x double].
691/// \param __b
692/// A 256-bit vector of [4 x double] containing one of the source operands.
693/// The horizontal sums of the values are returned in the odd-indexed
694/// elements of a vector of [4 x double].
695/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
696/// both operands.
697static __inline __m256d __DEFAULT_FN_ATTRS
698_mm256_hadd_pd(__m256d __a, __m256d __b)
699{
700 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
701}
702
703/// Horizontally adds the adjacent pairs of values contained in two
704/// 256-bit vectors of [8 x float].
705///
706/// \headerfile <x86intrin.h>
707///
708/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
709///
710/// \param __a
711/// A 256-bit vector of [8 x float] containing one of the source operands.
712/// The horizontal sums of the values are returned in the elements with
713/// index 0, 1, 4, 5 of a vector of [8 x float].
714/// \param __b
715/// A 256-bit vector of [8 x float] containing one of the source operands.
716/// The horizontal sums of the values are returned in the elements with
717/// index 2, 3, 6, 7 of a vector of [8 x float].
718/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
719/// both operands.
720static __inline __m256 __DEFAULT_FN_ATTRS
721_mm256_hadd_ps(__m256 __a, __m256 __b)
722{
723 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
724}
725
726/// Horizontally subtracts the adjacent pairs of values contained in two
727/// 256-bit vectors of [4 x double].
728///
729/// \headerfile <x86intrin.h>
730///
731/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
732///
733/// \param __a
734/// A 256-bit vector of [4 x double] containing one of the source operands.
735/// The horizontal differences between the values are returned in the
736/// even-indexed elements of a vector of [4 x double].
737/// \param __b
738/// A 256-bit vector of [4 x double] containing one of the source operands.
739/// The horizontal differences between the values are returned in the
740/// odd-indexed elements of a vector of [4 x double].
741/// \returns A 256-bit vector of [4 x double] containing the horizontal
742/// differences of both operands.
743static __inline __m256d __DEFAULT_FN_ATTRS
744_mm256_hsub_pd(__m256d __a, __m256d __b)
745{
746 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
747}
748
749/// Horizontally subtracts the adjacent pairs of values contained in two
750/// 256-bit vectors of [8 x float].
751///
752/// \headerfile <x86intrin.h>
753///
754/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
755///
756/// \param __a
757/// A 256-bit vector of [8 x float] containing one of the source operands.
758/// The horizontal differences between the values are returned in the
759/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
760/// \param __b
761/// A 256-bit vector of [8 x float] containing one of the source operands.
762/// The horizontal differences between the values are returned in the
763/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
764/// \returns A 256-bit vector of [8 x float] containing the horizontal
765/// differences of both operands.
766static __inline __m256 __DEFAULT_FN_ATTRS
767_mm256_hsub_ps(__m256 __a, __m256 __b)
768{
769 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
770}
771
772/* Vector permutations */
773/// Copies the values in a 128-bit vector of [2 x double] as specified
774/// by the 128-bit integer vector operand.
775///
776/// \headerfile <x86intrin.h>
777///
778/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
779///
780/// \param __a
781/// A 128-bit vector of [2 x double].
782/// \param __c
783/// A 128-bit integer vector operand specifying how the values are to be
784/// copied. \n
785/// Bit [1]: \n
786/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
787/// vector. \n
788/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
789/// returned vector. \n
790/// Bit [65]: \n
791/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
792/// returned vector. \n
793/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
794/// returned vector.
795/// \returns A 128-bit vector of [2 x double] containing the copied values.
796static __inline __m128d __DEFAULT_FN_ATTRS128
797_mm_permutevar_pd(__m128d __a, __m128i __c)
798{
799 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
800}
801
802/// Copies the values in a 256-bit vector of [4 x double] as specified
803/// by the 256-bit integer vector operand.
804///
805/// \headerfile <x86intrin.h>
806///
807/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
808///
809/// \param __a
810/// A 256-bit vector of [4 x double].
811/// \param __c
812/// A 256-bit integer vector operand specifying how the values are to be
813/// copied. \n
814/// Bit [1]: \n
815/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
816/// vector. \n
817/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
818/// returned vector. \n
819/// Bit [65]: \n
820/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
821/// returned vector. \n
822/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
823/// returned vector. \n
824/// Bit [129]: \n
825/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
826/// returned vector. \n
827/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
828/// returned vector. \n
829/// Bit [193]: \n
830/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
831/// returned vector. \n
832/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
833/// returned vector.
834/// \returns A 256-bit vector of [4 x double] containing the copied values.
835static __inline __m256d __DEFAULT_FN_ATTRS
836_mm256_permutevar_pd(__m256d __a, __m256i __c)
837{
838 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
839}
840
841/// Copies the values stored in a 128-bit vector of [4 x float] as
842/// specified by the 128-bit integer vector operand.
843///
844/// \headerfile <x86intrin.h>
845///
846/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
847///
848/// \param __a
849/// A 128-bit vector of [4 x float].
850/// \param __c
851/// A 128-bit integer vector operand specifying how the values are to be
852/// copied. \n
853/// Bits [1:0]: \n
854/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
855/// returned vector. \n
856/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
857/// returned vector. \n
858/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
859/// returned vector. \n
860/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
861/// returned vector. \n
862/// Bits [33:32]: \n
863/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
864/// returned vector. \n
865/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
866/// returned vector. \n
867/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
868/// returned vector. \n
869/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
870/// returned vector. \n
871/// Bits [65:64]: \n
872/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
873/// returned vector. \n
874/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
875/// returned vector. \n
876/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
877/// returned vector. \n
878/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
879/// returned vector. \n
880/// Bits [97:96]: \n
881/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
882/// returned vector. \n
883/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
884/// returned vector. \n
885/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
886/// returned vector. \n
887/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
888/// returned vector.
889/// \returns A 128-bit vector of [4 x float] containing the copied values.
890static __inline __m128 __DEFAULT_FN_ATTRS128
891_mm_permutevar_ps(__m128 __a, __m128i __c)
892{
893 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
894}
895
896/// Copies the values stored in a 256-bit vector of [8 x float] as
897/// specified by the 256-bit integer vector operand.
898///
899/// \headerfile <x86intrin.h>
900///
901/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
902///
903/// \param __a
904/// A 256-bit vector of [8 x float].
905/// \param __c
906/// A 256-bit integer vector operand specifying how the values are to be
907/// copied. \n
908/// Bits [1:0]: \n
909/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
910/// returned vector. \n
911/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
912/// returned vector. \n
913/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
914/// returned vector. \n
915/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
916/// returned vector. \n
917/// Bits [33:32]: \n
918/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
919/// returned vector. \n
920/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
921/// returned vector. \n
922/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
923/// returned vector. \n
924/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
925/// returned vector. \n
926/// Bits [65:64]: \n
927/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
928/// returned vector. \n
929/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
930/// returned vector. \n
931/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
932/// returned vector. \n
933/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
934/// returned vector. \n
935/// Bits [97:96]: \n
936/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
937/// returned vector. \n
938/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
939/// returned vector. \n
940/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
941/// returned vector. \n
942/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
943/// returned vector. \n
944/// Bits [129:128]: \n
945/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
946/// returned vector. \n
947/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
948/// returned vector. \n
949/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
950/// returned vector. \n
951/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
952/// returned vector. \n
953/// Bits [161:160]: \n
954/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
955/// returned vector. \n
956/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
957/// returned vector. \n
958/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
959/// returned vector. \n
960/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
961/// returned vector. \n
962/// Bits [193:192]: \n
963/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
964/// returned vector. \n
965/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
966/// returned vector. \n
967/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
968/// returned vector. \n
969/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
970/// returned vector. \n
971/// Bits [225:224]: \n
972/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
973/// returned vector. \n
974/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
975/// returned vector. \n
976/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
977/// returned vector. \n
978/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
979/// returned vector.
980/// \returns A 256-bit vector of [8 x float] containing the copied values.
981static __inline __m256 __DEFAULT_FN_ATTRS
983{
984 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
985}
986
987/// Copies the values in a 128-bit vector of [2 x double] as specified
988/// by the immediate integer operand.
989///
990/// \headerfile <x86intrin.h>
991///
992/// \code
993/// __m128d _mm_permute_pd(__m128d A, const int C);
994/// \endcode
995///
996/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
997///
998/// \param A
999/// A 128-bit vector of [2 x double].
1000/// \param C
1001/// An immediate integer operand specifying how the values are to be
1002/// copied. \n
1003/// Bit [0]: \n
1004/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1005/// vector. \n
1006/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1007/// returned vector. \n
1008/// Bit [1]: \n
1009/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1010/// returned vector. \n
1011/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1012/// returned vector.
1013/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd( \
      (__v2df)(__m128d)(A) /* source, viewed as [2 x double] */, \
      (int)(C) /* immediate control operand */))
1016
1017/// Copies the values in a 256-bit vector of [4 x double] as specified by
1018/// the immediate integer operand.
1019///
1020/// \headerfile <x86intrin.h>
1021///
1022/// \code
1023/// __m256d _mm256_permute_pd(__m256d A, const int C);
1024/// \endcode
1025///
1026/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1027///
1028/// \param A
1029/// A 256-bit vector of [4 x double].
1030/// \param C
1031/// An immediate integer operand specifying how the values are to be
1032/// copied. \n
1033/// Bit [0]: \n
1034/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1035/// vector. \n
1036/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1037/// returned vector. \n
1038/// Bit [1]: \n
1039/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1040/// returned vector. \n
1041/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1042/// returned vector. \n
1043/// Bit [2]: \n
1044/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1045/// returned vector. \n
1046/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1047/// returned vector. \n
1048/// Bit [3]: \n
1049/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1050/// returned vector. \n
1051/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1052/// returned vector.
1053/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256( \
      (__v4df)(__m256d)(A) /* source, viewed as [4 x double] */, \
      (int)(C) /* immediate control operand */))
1056
1057/// Copies the values in a 128-bit vector of [4 x float] as specified by
1058/// the immediate integer operand.
1059///
1060/// \headerfile <x86intrin.h>
1061///
1062/// \code
1063/// __m128 _mm_permute_ps(__m128 A, const int C);
1064/// \endcode
1065///
1066/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1067///
1068/// \param A
1069/// A 128-bit vector of [4 x float].
1070/// \param C
1071/// An immediate integer operand specifying how the values are to be
1072/// copied. \n
1073/// Bits [1:0]: \n
1074/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1075/// returned vector. \n
1076/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1077/// returned vector. \n
1078/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1079/// returned vector. \n
1080/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1081/// returned vector. \n
1082/// Bits [3:2]: \n
1083/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1084/// returned vector. \n
1085/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1086/// returned vector. \n
1087/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1088/// returned vector. \n
1089/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1090/// returned vector. \n
1091/// Bits [5:4]: \n
1092/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1093/// returned vector. \n
1094/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1095/// returned vector. \n
1096/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1097/// returned vector. \n
1098/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1099/// returned vector. \n
1100/// Bits [7:6]: \n
1101/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1102/// returned vector. \n
1103/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1104/// returned vector. \n
1105/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1106/// returned vector. \n
1107/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1108/// returned vector.
1109/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps( \
      (__v4sf)(__m128)(A) /* source, viewed as [4 x float] */, \
      (int)(C) /* immediate control operand */))
1112
1113/// Copies the values in a 256-bit vector of [8 x float] as specified by
1114/// the immediate integer operand.
1115///
1116/// \headerfile <x86intrin.h>
1117///
1118/// \code
1119/// __m256 _mm256_permute_ps(__m256 A, const int C);
1120/// \endcode
1121///
1122/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1123///
1124/// \param A
1125/// A 256-bit vector of [8 x float].
1126/// \param C
1127/// An immediate integer operand specifying how the values are to be
1128/// copied. \n
1129/// Bits [1:0]: \n
1130/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1131/// returned vector. \n
1132/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1133/// returned vector. \n
1134/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1135/// returned vector. \n
1136/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1137/// returned vector. \n
1138/// Bits [3:2]: \n
1139/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1140/// returned vector. \n
1141/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1142/// returned vector. \n
1143/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1144/// returned vector. \n
1145/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1146/// returned vector. \n
1147/// Bits [5:4]: \n
1148/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1149/// returned vector. \n
1150/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1151/// returned vector. \n
1152/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1153/// returned vector. \n
1154/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1155/// returned vector. \n
1156/// Bits [7:6]: \n
1157/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1158/// returned vector. \n
1159/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1160/// returned vector. \n
1161/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1162/// returned vector. \n
1163/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1164/// returned vector. \n
1165/// Bits [1:0]: \n
1166/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1167/// returned vector. \n
1168/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1169/// returned vector. \n
1170/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1171/// returned vector. \n
1172/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1173/// returned vector. \n
1174/// Bits [3:2]: \n
1175/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1176/// returned vector. \n
1177/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1178/// returned vector. \n
1179/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1180/// returned vector. \n
1181/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1182/// returned vector. \n
1183/// Bits [5:4]: \n
1184/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1185/// returned vector. \n
1186/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1187/// returned vector. \n
1188/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1189/// returned vector. \n
1190/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1191/// returned vector. \n
1192/// Bits [7:6]: \n
1193/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1194/// returned vector. \n
1195/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1196/// returned vector. \n
1197/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1198/// returned vector. \n
1199/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1200/// returned vector.
1201/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256( \
      (__v8sf)(__m256)(A) /* source, viewed as [8 x float] */, \
      (int)(C) /* immediate control operand */))
1204
1205/// Permutes 128-bit data values stored in two 256-bit vectors of
1206/// [4 x double], as specified by the immediate integer operand.
1207///
1208/// \headerfile <x86intrin.h>
1209///
1210/// \code
1211/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1212/// \endcode
1213///
1214/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1215///
1216/// \param V1
1217/// A 256-bit vector of [4 x double].
1218/// \param V2
///    A 256-bit vector of [4 x double].
1220/// \param M
1221/// An immediate integer operand specifying how the values are to be
1222/// permuted. \n
1223/// Bits [1:0]: \n
1224/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1225/// destination. \n
1226/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1227/// destination. \n
1228/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1229/// destination. \n
1230/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1231/// destination. \n
1232/// Bits [5:4]: \n
1233/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1234/// destination. \n
1235/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1236/// destination. \n
1237/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1238/// destination. \n
1239/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1240/// destination.
1241/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256( \
      (__v4df)(__m256d)(V1) /* first source, viewed as [4 x double] */, \
      (__v4df)(__m256d)(V2) /* second source, viewed as [4 x double] */, \
      (int)(M) /* immediate control operand */))
1245
1246/// Permutes 128-bit data values stored in two 256-bit vectors of
1247/// [8 x float], as specified by the immediate integer operand.
1248///
1249/// \headerfile <x86intrin.h>
1250///
1251/// \code
1252/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1253/// \endcode
1254///
1255/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1256///
1257/// \param V1
1258/// A 256-bit vector of [8 x float].
1259/// \param V2
1260/// A 256-bit vector of [8 x float].
1261/// \param M
1262/// An immediate integer operand specifying how the values are to be
1263/// permuted. \n
1264/// Bits [1:0]: \n
1265/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1266/// destination. \n
1267/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1268/// destination. \n
1269/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1270/// destination. \n
1271/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1272/// destination. \n
1273/// Bits [5:4]: \n
1274/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1275/// destination. \n
1276/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1277/// destination. \n
1278/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1279/// destination. \n
1280/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1281/// destination.
1282/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256( \
      (__v8sf)(__m256)(V1) /* first source, viewed as [8 x float] */, \
      (__v8sf)(__m256)(V2) /* second source, viewed as [8 x float] */, \
      (int)(M) /* immediate control operand */))
1286
1287/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1288/// as specified by the immediate integer operand.
1289///
1290/// \headerfile <x86intrin.h>
1291///
1292/// \code
1293/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1294/// \endcode
1295///
1296/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1297///
1298/// \param V1
1299/// A 256-bit integer vector.
1300/// \param V2
1301/// A 256-bit integer vector.
1302/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
1304/// Bits [1:0]: \n
1305/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1306/// destination. \n
1307/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1308/// destination. \n
1309/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1310/// destination. \n
1311/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1312/// destination. \n
1313/// Bits [5:4]: \n
1314/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1315/// destination. \n
1316/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1317/// destination. \n
1318/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1319/// destination. \n
1320/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1321/// destination.
1322/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256( \
      (__v8si)(__m256i)(V1) /* first source, viewed as [8 x int] */, \
      (__v8si)(__m256i)(V2) /* second source, viewed as [8 x int] */, \
      (int)(M) /* immediate control operand */))
1326
1327/* Vector Blend */
1328/// Merges 64-bit double-precision data values stored in either of the
1329/// two 256-bit vectors of [4 x double], as specified by the immediate
1330/// integer operand.
1331///
1332/// \headerfile <x86intrin.h>
1333///
1334/// \code
1335/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1336/// \endcode
1337///
1338/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1339///
1340/// \param V1
1341/// A 256-bit vector of [4 x double].
1342/// \param V2
1343/// A 256-bit vector of [4 x double].
1344/// \param M
1345/// An immediate integer operand, with mask bits [3:0] specifying how the
1346/// values are to be copied. The position of the mask bit corresponds to the
1347/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1348/// element in operand \a V1 is copied to the same position in the
1349/// destination. When a mask bit is 1, the corresponding 64-bit element in
1350/// operand \a V2 is copied to the same position in the destination.
1351/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256( \
      (__v4df)(__m256d)(V1) /* first source, viewed as [4 x double] */, \
      (__v4df)(__m256d)(V2) /* second source, viewed as [4 x double] */, \
      (int)(M) /* immediate blend mask */))
1355
1356/// Merges 32-bit single-precision data values stored in either of the
1357/// two 256-bit vectors of [8 x float], as specified by the immediate
1358/// integer operand.
1359///
1360/// \headerfile <x86intrin.h>
1361///
1362/// \code
1363/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1364/// \endcode
1365///
1366/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1367///
1368/// \param V1
1369/// A 256-bit vector of [8 x float].
1370/// \param V2
1371/// A 256-bit vector of [8 x float].
1372/// \param M
1373/// An immediate integer operand, with mask bits [7:0] specifying how the
1374/// values are to be copied. The position of the mask bit corresponds to the
1375/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1376/// element in operand \a V1 is copied to the same position in the
1377/// destination. When a mask bit is 1, the corresponding 32-bit element in
1378/// operand \a V2 is copied to the same position in the destination.
1379/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256( \
      (__v8sf)(__m256)(V1) /* first source, viewed as [8 x float] */, \
      (__v8sf)(__m256)(V2) /* second source, viewed as [8 x float] */, \
      (int)(M) /* immediate blend mask */))
1383
1384/// Merges 64-bit double-precision data values stored in either of the
1385/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1386/// operand.
1387///
1388/// \headerfile <x86intrin.h>
1389///
1390/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1391///
1392/// \param __a
1393/// A 256-bit vector of [4 x double].
1394/// \param __b
1395/// A 256-bit vector of [4 x double].
1396/// \param __c
1397/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1398/// how the values are to be copied. The position of the mask bit corresponds
1399/// to the most significant bit of a copied value. When a mask bit is 0, the
1400/// corresponding 64-bit element in operand \a __a is copied to the same
1401/// position in the destination. When a mask bit is 1, the corresponding
1402/// 64-bit element in operand \a __b is copied to the same position in the
1403/// destination.
1404/// \returns A 256-bit vector of [4 x double] containing the copied values.
1405static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
1406_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {
1407 return (__m256d)__builtin_ia32_blendvpd256(
1408 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1409}
1410
1411/// Merges 32-bit single-precision data values stored in either of the
1412/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1413/// operand.
1414///
1415/// \headerfile <x86intrin.h>
1416///
1417/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1418///
1419/// \param __a
1420/// A 256-bit vector of [8 x float].
1421/// \param __b
1422/// A 256-bit vector of [8 x float].
1423/// \param __c
1424/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1425/// and 31 specifying how the values are to be copied. The position of the
1426/// mask bit corresponds to the most significant bit of a copied value. When
1427/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1428/// copied to the same position in the destination. When a mask bit is 1, the
1429/// corresponding 32-bit element in operand \a __b is copied to the same
1430/// position in the destination.
1431/// \returns A 256-bit vector of [8 x float] containing the copied values.
1432static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
1433_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {
1434 return (__m256)__builtin_ia32_blendvps256(
1435 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1436}
1437
1438/* Vector Dot Product */
1439/// Computes two dot products in parallel, using the lower and upper
1440/// halves of two [8 x float] vectors as input to the two computations, and
1441/// returning the two dot products in the lower and upper halves of the
1442/// [8 x float] result.
1443///
1444/// The immediate integer operand controls which input elements will
1445/// contribute to the dot product, and where the final results are returned.
1446/// In general, for each dot product, the four corresponding elements of the
1447/// input vectors are multiplied; the first two and second two products are
1448/// summed, then the two sums are added to form the final result.
1449///
1450/// \headerfile <x86intrin.h>
1451///
1452/// \code
1453/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1454/// \endcode
1455///
1456/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1457///
1458/// \param V1
1459/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1460/// \param V2
1461/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1462/// \param M
1463/// An immediate integer argument. Bits [7:4] determine which elements of
1464/// the input vectors are used, with bit [4] corresponding to the lowest
1465/// element and bit [7] corresponding to the highest element of each [4 x
1466/// float] subvector. If a bit is set, the corresponding elements from the
1467/// two input vectors are used as an input for dot product; otherwise that
1468/// input is treated as zero. Bits [3:0] determine which elements of the
1469/// result will receive a copy of the final dot product, with bit [0]
1470/// corresponding to the lowest element and bit [3] corresponding to the
1471/// highest element of each [4 x float] subvector. If a bit is set, the dot
1472/// product is returned in the corresponding element; otherwise that element
1473/// is set to zero. The bitmask is applied in the same way to each of the
1474/// two parallel dot product computations.
1475/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256( \
      (__v8sf)(__m256)(V1) /* first input, viewed as [8 x float] */, \
      (__v8sf)(__m256)(V2) /* second input, viewed as [8 x float] */, \
      (int)(M) /* immediate mask; cast to int like the sibling macros */))
1479
1480/* Vector shuffle */
1481/// Selects 8 float values from the 256-bit operands of [8 x float], as
1482/// specified by the immediate value operand.
1483///
1484/// The four selected elements in each operand are copied to the destination
1485/// according to the bits specified in the immediate operand. The selected
1486/// elements from the first 256-bit operand are copied to bits [63:0] and
1487/// bits [191:128] of the destination, and the selected elements from the
1488/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1489/// the destination. For example, if bits [7:0] of the immediate operand
1490/// contain a value of 0xFF, the 256-bit destination vector would contain the
1491/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1492///
1493/// \headerfile <x86intrin.h>
1494///
1495/// \code
1496/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1497/// \endcode
1498///
1499/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1500///
1501/// \param a
1502/// A 256-bit vector of [8 x float]. The four selected elements in this
1503/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1504/// according to the bits specified in the immediate operand.
1505/// \param b
1506/// A 256-bit vector of [8 x float]. The four selected elements in this
1507/// operand are copied to bits [127:64] and bits [255:192] in the
1508/// destination, according to the bits specified in the immediate operand.
1509/// \param mask
1510/// An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
1512/// Bits [3:0] specify the values copied from operand \a a. \n
1513/// Bits [7:4] specify the values copied from operand \a b. \n
1514/// The destinations within the 256-bit destination are assigned values as
1515/// follows, according to the bit value assignments described below: \n
1516/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1517/// destination. \n
1518/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1519/// destination. \n
1520/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1521/// destination. \n
1522/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1523/// the destination. \n
1524/// Bit value assignments: \n
1525/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1526/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1527/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1528/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1529/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1530/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1531/// <c>[b6, b4, b2, b0]</c>.
1532/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256( \
      (__v8sf)(__m256)(a) /* first source, viewed as [8 x float] */, \
      (__v8sf)(__m256)(b) /* second source, viewed as [8 x float] */, \
      (int)(mask) /* immediate element selector */))
1536
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy
///    from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))

/* Compare */
/* Comparison predicates 0x08 through 0x1f, for use as the immediate operand
 * of the cmp intrinsics documented below. */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/* The intrinsic below is defined in emmintrin.h and can be used for AVX. */
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)

/* The intrinsic below is defined in xmmintrin.h and can be used for AVX. */
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)

/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))

/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))

/* The intrinsic below is defined in emmintrin.h and can be used for AVX. */
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)

/* The intrinsic below is defined in xmmintrin.h and can be used for AVX. */
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)

/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))

/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
// The (unsigned short) cast is what makes the widening to int zero-extend.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))

/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
// The (unsigned char) cast is what makes the widening to int zero-extend.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif

/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))


/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))

/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif

2164/* Conversion */
2165/// Converts a vector of [4 x i32] into a vector of [4 x double].
2166///
2167/// \headerfile <x86intrin.h>
2168///
2169/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2170///
2171/// \param __a
2172/// A 128-bit integer vector of [4 x i32].
2173/// \returns A 256-bit vector of [4 x double] containing the converted values.
2174static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2176 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2177}
2178
2179/// Converts a vector of [8 x i32] into a vector of [8 x float].
2180///
2181/// \headerfile <x86intrin.h>
2182///
2183/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2184///
2185/// \param __a
2186/// A 256-bit integer vector.
2187/// \returns A 256-bit vector of [8 x float] containing the converted values.
2188static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2190 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2191}
2192
2193/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2194/// [4 x float].
2195///
2196/// \headerfile <x86intrin.h>
2197///
2198/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2199///
2200/// \param __a
2201/// A 256-bit vector of [4 x double].
2202/// \returns A 128-bit vector of [4 x float] containing the converted values.
2203static __inline __m128 __DEFAULT_FN_ATTRS
2205{
2206 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2207}
2208
2209/// Converts a vector of [8 x float] into a vector of [8 x i32].
2210///
2211/// If a converted value does not fit in a 32-bit integer, raises a
2212/// floating-point invalid exception. If the exception is masked, returns
2213/// the most negative integer.
2214///
2215/// \headerfile <x86intrin.h>
2216///
2217/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2218///
2219/// \param __a
2220/// A 256-bit vector of [8 x float].
2221/// \returns A 256-bit integer vector containing the converted values.
2222static __inline __m256i __DEFAULT_FN_ATTRS
2224{
2225 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2226}
2227
2228/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2229/// x double].
2230///
2231/// \headerfile <x86intrin.h>
2232///
2233/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2234///
2235/// \param __a
2236/// A 128-bit vector of [4 x float].
2237/// \returns A 256-bit vector of [4 x double] containing the converted values.
2238static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2240 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2241}
2242
2243/// Converts a 256-bit vector of [4 x double] into four signed truncated
2244/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2245/// [4 x i32].
2246///
2247/// If a converted value does not fit in a 32-bit integer, raises a
2248/// floating-point invalid exception. If the exception is masked, returns
2249/// the most negative integer.
2250///
2251/// \headerfile <x86intrin.h>
2252///
2253/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2254///
2255/// \param __a
2256/// A 256-bit vector of [4 x double].
2257/// \returns A 128-bit integer vector containing the converted values.
2258static __inline __m128i __DEFAULT_FN_ATTRS
2260{
2261 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2262}
2263
2264/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2265/// [4 x i32].
2266///
2267/// If a converted value does not fit in a 32-bit integer, raises a
2268/// floating-point invalid exception. If the exception is masked, returns
2269/// the most negative integer.
2270///
2271/// \headerfile <x86intrin.h>
2272///
2273/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2274///
2275/// \param __a
2276/// A 256-bit vector of [4 x double].
2277/// \returns A 128-bit integer vector containing the converted values.
2278static __inline __m128i __DEFAULT_FN_ATTRS
2280{
2281 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2282}
2283
2284/// Converts a vector of [8 x float] into eight signed truncated (rounded
2285/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2286///
2287/// If a converted value does not fit in a 32-bit integer, raises a
2288/// floating-point invalid exception. If the exception is masked, returns
2289/// the most negative integer.
2290///
2291/// \headerfile <x86intrin.h>
2292///
2293/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2294///
2295/// \param __a
2296/// A 256-bit vector of [8 x float].
2297/// \returns A 256-bit integer vector containing the converted values.
2298static __inline __m256i __DEFAULT_FN_ATTRS
2300{
2301 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2302}
2303
2304/// Returns the first element of the input vector of [4 x double].
2305///
2306/// \headerfile <x86intrin.h>
2307///
2308/// This intrinsic is a utility function and does not correspond to a specific
2309/// instruction.
2310///
2311/// \param __a
2312/// A 256-bit vector of [4 x double].
2313/// \returns A 64 bit double containing the first element of the input vector.
2314static __inline double __DEFAULT_FN_ATTRS
2316{
2317 return __a[0];
2318}
2319
2320/// Returns the first element of the input vector of [8 x i32].
2321///
2322/// \headerfile <x86intrin.h>
2323///
2324/// This intrinsic is a utility function and does not correspond to a specific
2325/// instruction.
2326///
2327/// \param __a
2328/// A 256-bit vector of [8 x i32].
2329/// \returns A 32 bit integer containing the first element of the input vector.
2330static __inline int __DEFAULT_FN_ATTRS
2332{
2333 __v8si __b = (__v8si)__a;
2334 return __b[0];
2335}
2336
2337/// Returns the first element of the input vector of [8 x float].
2338///
2339/// \headerfile <x86intrin.h>
2340///
2341/// This intrinsic is a utility function and does not correspond to a specific
2342/// instruction.
2343///
2344/// \param __a
2345/// A 256-bit vector of [8 x float].
2346/// \returns A 32 bit float containing the first element of the input vector.
2347static __inline float __DEFAULT_FN_ATTRS
2349{
2350 return __a[0];
2351}
2352
/* Vector replicate */
2354/// Moves and duplicates odd-indexed values from a 256-bit vector of
2355/// [8 x float] to float values in a 256-bit vector of [8 x float].
2356///
2357/// \headerfile <x86intrin.h>
2358///
2359/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2360///
2361/// \param __a
2362/// A 256-bit vector of [8 x float]. \n
2363/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2364/// the return value. \n
2365/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2366/// the return value. \n
2367/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2368/// return value. \n
2369/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2370/// return value.
2371/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2372/// values.
2373static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2375{
2376 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2377}
2378
2379/// Moves and duplicates even-indexed values from a 256-bit vector of
2380/// [8 x float] to float values in a 256-bit vector of [8 x float].
2381///
2382/// \headerfile <x86intrin.h>
2383///
2384/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2385///
2386/// \param __a
2387/// A 256-bit vector of [8 x float]. \n
2388/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2389/// the return value. \n
2390/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2391/// the return value. \n
2392/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2393/// return value. \n
2394/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2395/// return value.
2396/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2397/// values.
2398static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2400{
2401 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2402}
2403
2404/// Moves and duplicates double-precision floating point values from a
2405/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2406/// vector of [4 x double].
2407///
2408/// \headerfile <x86intrin.h>
2409///
2410/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2411///
2412/// \param __a
2413/// A 256-bit vector of [4 x double]. \n
2414/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2415/// return value. \n
2416/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2417/// the return value.
2418/// \returns A 256-bit vector of [4 x double] containing the moved and
2419/// duplicated values.
2420static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2422{
2423 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2424}
2425
/* Unpack and Interleave */
2427/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2428/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2429///
2430/// \headerfile <x86intrin.h>
2431///
2432/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2433///
2434/// \param __a
2435/// A 256-bit floating-point vector of [4 x double]. \n
2436/// Bits [127:64] are written to bits [63:0] of the return value. \n
2437/// Bits [255:192] are written to bits [191:128] of the return value. \n
2438/// \param __b
2439/// A 256-bit floating-point vector of [4 x double]. \n
2440/// Bits [127:64] are written to bits [127:64] of the return value. \n
2441/// Bits [255:192] are written to bits [255:192] of the return value. \n
2442/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2443static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2444_mm256_unpackhi_pd(__m256d __a, __m256d __b) {
2445 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2446}
2447
2448/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2449/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2450///
2451/// \headerfile <x86intrin.h>
2452///
2453/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2454///
2455/// \param __a
2456/// A 256-bit floating-point vector of [4 x double]. \n
2457/// Bits [63:0] are written to bits [63:0] of the return value. \n
2458/// Bits [191:128] are written to bits [191:128] of the return value.
2459/// \param __b
2460/// A 256-bit floating-point vector of [4 x double]. \n
2461/// Bits [63:0] are written to bits [127:64] of the return value. \n
2462/// Bits [191:128] are written to bits [255:192] of the return value. \n
2463/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2464static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2465_mm256_unpacklo_pd(__m256d __a, __m256d __b) {
2466 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2467}
2468
2469/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2470/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2471/// vector of [8 x float].
2472///
2473/// \headerfile <x86intrin.h>
2474///
2475/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2476///
2477/// \param __a
2478/// A 256-bit vector of [8 x float]. \n
2479/// Bits [95:64] are written to bits [31:0] of the return value. \n
2480/// Bits [127:96] are written to bits [95:64] of the return value. \n
2481/// Bits [223:192] are written to bits [159:128] of the return value. \n
2482/// Bits [255:224] are written to bits [223:192] of the return value.
2483/// \param __b
2484/// A 256-bit vector of [8 x float]. \n
2485/// Bits [95:64] are written to bits [63:32] of the return value. \n
2486/// Bits [127:96] are written to bits [127:96] of the return value. \n
2487/// Bits [223:192] are written to bits [191:160] of the return value. \n
2488/// Bits [255:224] are written to bits [255:224] of the return value.
2489/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2490static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2491_mm256_unpackhi_ps(__m256 __a, __m256 __b) {
2492 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2493}
2494
2495/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2496/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2497/// vector of [8 x float].
2498///
2499/// \headerfile <x86intrin.h>
2500///
2501/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2502///
2503/// \param __a
2504/// A 256-bit vector of [8 x float]. \n
2505/// Bits [31:0] are written to bits [31:0] of the return value. \n
2506/// Bits [63:32] are written to bits [95:64] of the return value. \n
2507/// Bits [159:128] are written to bits [159:128] of the return value. \n
2508/// Bits [191:160] are written to bits [223:192] of the return value.
2509/// \param __b
2510/// A 256-bit vector of [8 x float]. \n
2511/// Bits [31:0] are written to bits [63:32] of the return value. \n
2512/// Bits [63:32] are written to bits [127:96] of the return value. \n
2513/// Bits [159:128] are written to bits [191:160] of the return value. \n
2514/// Bits [191:160] are written to bits [255:224] of the return value.
2515/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2516static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2517_mm256_unpacklo_ps(__m256 __a, __m256 __b) {
2518 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2519}
2520
/* Bit Test */
2522/// Given two 128-bit floating-point vectors of [2 x double], perform an
2523/// element-by-element comparison of the double-precision element in the
2524/// first source vector and the corresponding element in the second source
2525/// vector.
2526///
2527/// The EFLAGS register is updated as follows: \n
2528/// If there is at least one pair of double-precision elements where the
2529/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2530/// ZF flag is set to 1. \n
2531/// If there is at least one pair of double-precision elements where the
2532/// sign-bit of the first element is 0 and the sign-bit of the second element
2533/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2534/// This intrinsic returns the value of the ZF flag.
2535///
2536/// \headerfile <x86intrin.h>
2537///
2538/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2539///
2540/// \param __a
2541/// A 128-bit vector of [2 x double].
2542/// \param __b
2543/// A 128-bit vector of [2 x double].
2544/// \returns the ZF flag in the EFLAGS register.
2545static __inline int __DEFAULT_FN_ATTRS128
2546_mm_testz_pd(__m128d __a, __m128d __b)
2547{
2548 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2549}
2550
2551/// Given two 128-bit floating-point vectors of [2 x double], perform an
2552/// element-by-element comparison of the double-precision element in the
2553/// first source vector and the corresponding element in the second source
2554/// vector.
2555///
2556/// The EFLAGS register is updated as follows: \n
2557/// If there is at least one pair of double-precision elements where the
2558/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2559/// ZF flag is set to 1. \n
2560/// If there is at least one pair of double-precision elements where the
2561/// sign-bit of the first element is 0 and the sign-bit of the second element
2562/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2563/// This intrinsic returns the value of the CF flag.
2564///
2565/// \headerfile <x86intrin.h>
2566///
2567/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2568///
2569/// \param __a
2570/// A 128-bit vector of [2 x double].
2571/// \param __b
2572/// A 128-bit vector of [2 x double].
2573/// \returns the CF flag in the EFLAGS register.
2574static __inline int __DEFAULT_FN_ATTRS128
2575_mm_testc_pd(__m128d __a, __m128d __b)
2576{
2577 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2578}
2579
2580/// Given two 128-bit floating-point vectors of [2 x double], perform an
2581/// element-by-element comparison of the double-precision element in the
2582/// first source vector and the corresponding element in the second source
2583/// vector.
2584///
2585/// The EFLAGS register is updated as follows: \n
2586/// If there is at least one pair of double-precision elements where the
2587/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2588/// ZF flag is set to 1. \n
2589/// If there is at least one pair of double-precision elements where the
2590/// sign-bit of the first element is 0 and the sign-bit of the second element
2591/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2592/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2593/// otherwise it returns 0.
2594///
2595/// \headerfile <x86intrin.h>
2596///
2597/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2598///
2599/// \param __a
2600/// A 128-bit vector of [2 x double].
2601/// \param __b
2602/// A 128-bit vector of [2 x double].
2603/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2604static __inline int __DEFAULT_FN_ATTRS128
2605_mm_testnzc_pd(__m128d __a, __m128d __b)
2606{
2607 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2608}
2609
2610/// Given two 128-bit floating-point vectors of [4 x float], perform an
2611/// element-by-element comparison of the single-precision element in the
2612/// first source vector and the corresponding element in the second source
2613/// vector.
2614///
2615/// The EFLAGS register is updated as follows: \n
2616/// If there is at least one pair of single-precision elements where the
2617/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2618/// ZF flag is set to 1. \n
2619/// If there is at least one pair of single-precision elements where the
2620/// sign-bit of the first element is 0 and the sign-bit of the second element
2621/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2622/// This intrinsic returns the value of the ZF flag.
2623///
2624/// \headerfile <x86intrin.h>
2625///
2626/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2627///
2628/// \param __a
2629/// A 128-bit vector of [4 x float].
2630/// \param __b
2631/// A 128-bit vector of [4 x float].
2632/// \returns the ZF flag.
2633static __inline int __DEFAULT_FN_ATTRS128
2634_mm_testz_ps(__m128 __a, __m128 __b)
2635{
2636 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2637}
2638
2639/// Given two 128-bit floating-point vectors of [4 x float], perform an
2640/// element-by-element comparison of the single-precision element in the
2641/// first source vector and the corresponding element in the second source
2642/// vector.
2643///
2644/// The EFLAGS register is updated as follows: \n
2645/// If there is at least one pair of single-precision elements where the
2646/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2647/// ZF flag is set to 1. \n
2648/// If there is at least one pair of single-precision elements where the
2649/// sign-bit of the first element is 0 and the sign-bit of the second element
2650/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2651/// This intrinsic returns the value of the CF flag.
2652///
2653/// \headerfile <x86intrin.h>
2654///
2655/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2656///
2657/// \param __a
2658/// A 128-bit vector of [4 x float].
2659/// \param __b
2660/// A 128-bit vector of [4 x float].
2661/// \returns the CF flag.
2662static __inline int __DEFAULT_FN_ATTRS128
2663_mm_testc_ps(__m128 __a, __m128 __b)
2664{
2665 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2666}
2667
2668/// Given two 128-bit floating-point vectors of [4 x float], perform an
2669/// element-by-element comparison of the single-precision element in the
2670/// first source vector and the corresponding element in the second source
2671/// vector.
2672///
2673/// The EFLAGS register is updated as follows: \n
2674/// If there is at least one pair of single-precision elements where the
2675/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2676/// ZF flag is set to 1. \n
2677/// If there is at least one pair of single-precision elements where the
2678/// sign-bit of the first element is 0 and the sign-bit of the second element
2679/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2680/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2681/// otherwise it returns 0.
2682///
2683/// \headerfile <x86intrin.h>
2684///
2685/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2686///
2687/// \param __a
2688/// A 128-bit vector of [4 x float].
2689/// \param __b
2690/// A 128-bit vector of [4 x float].
2691/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2692static __inline int __DEFAULT_FN_ATTRS128
2693_mm_testnzc_ps(__m128 __a, __m128 __b)
2694{
2695 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2696}
2697
2698/// Given two 256-bit floating-point vectors of [4 x double], perform an
2699/// element-by-element comparison of the double-precision elements in the
2700/// first source vector and the corresponding elements in the second source
2701/// vector.
2702///
2703/// The EFLAGS register is updated as follows: \n
2704/// If there is at least one pair of double-precision elements where the
2705/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2706/// ZF flag is set to 1. \n
2707/// If there is at least one pair of double-precision elements where the
2708/// sign-bit of the first element is 0 and the sign-bit of the second element
2709/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2710/// This intrinsic returns the value of the ZF flag.
2711///
2712/// \headerfile <x86intrin.h>
2713///
2714/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2715///
2716/// \param __a
2717/// A 256-bit vector of [4 x double].
2718/// \param __b
2719/// A 256-bit vector of [4 x double].
2720/// \returns the ZF flag.
2721static __inline int __DEFAULT_FN_ATTRS
2722_mm256_testz_pd(__m256d __a, __m256d __b)
2723{
2724 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2725}
2726
2727/// Given two 256-bit floating-point vectors of [4 x double], perform an
2728/// element-by-element comparison of the double-precision elements in the
2729/// first source vector and the corresponding elements in the second source
2730/// vector.
2731///
2732/// The EFLAGS register is updated as follows: \n
2733/// If there is at least one pair of double-precision elements where the
2734/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2735/// ZF flag is set to 1. \n
2736/// If there is at least one pair of double-precision elements where the
2737/// sign-bit of the first element is 0 and the sign-bit of the second element
2738/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2739/// This intrinsic returns the value of the CF flag.
2740///
2741/// \headerfile <x86intrin.h>
2742///
2743/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2744///
2745/// \param __a
2746/// A 256-bit vector of [4 x double].
2747/// \param __b
2748/// A 256-bit vector of [4 x double].
2749/// \returns the CF flag.
2750static __inline int __DEFAULT_FN_ATTRS
2751_mm256_testc_pd(__m256d __a, __m256d __b)
2752{
2753 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2754}
2755
2756/// Given two 256-bit floating-point vectors of [4 x double], perform an
2757/// element-by-element comparison of the double-precision elements in the
2758/// first source vector and the corresponding elements in the second source
2759/// vector.
2760///
2761/// The EFLAGS register is updated as follows: \n
2762/// If there is at least one pair of double-precision elements where the
2763/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2764/// ZF flag is set to 1. \n
2765/// If there is at least one pair of double-precision elements where the
2766/// sign-bit of the first element is 0 and the sign-bit of the second element
2767/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2768/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2769/// otherwise it returns 0.
2770///
2771/// \headerfile <x86intrin.h>
2772///
2773/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2774///
2775/// \param __a
2776/// A 256-bit vector of [4 x double].
2777/// \param __b
2778/// A 256-bit vector of [4 x double].
2779/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2780static __inline int __DEFAULT_FN_ATTRS
2781_mm256_testnzc_pd(__m256d __a, __m256d __b)
2782{
2783 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2784}
2785
2786/// Given two 256-bit floating-point vectors of [8 x float], perform an
2787/// element-by-element comparison of the single-precision element in the
2788/// first source vector and the corresponding element in the second source
2789/// vector.
2790///
2791/// The EFLAGS register is updated as follows: \n
2792/// If there is at least one pair of single-precision elements where the
2793/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2794/// ZF flag is set to 1. \n
2795/// If there is at least one pair of single-precision elements where the
2796/// sign-bit of the first element is 0 and the sign-bit of the second element
2797/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2798/// This intrinsic returns the value of the ZF flag.
2799///
2800/// \headerfile <x86intrin.h>
2801///
2802/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2803///
2804/// \param __a
2805/// A 256-bit vector of [8 x float].
2806/// \param __b
2807/// A 256-bit vector of [8 x float].
2808/// \returns the ZF flag.
2809static __inline int __DEFAULT_FN_ATTRS
2810_mm256_testz_ps(__m256 __a, __m256 __b)
2811{
2812 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2813}
2814
2815/// Given two 256-bit floating-point vectors of [8 x float], perform an
2816/// element-by-element comparison of the single-precision element in the
2817/// first source vector and the corresponding element in the second source
2818/// vector.
2819///
2820/// The EFLAGS register is updated as follows: \n
2821/// If there is at least one pair of single-precision elements where the
2822/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2823/// ZF flag is set to 1. \n
2824/// If there is at least one pair of single-precision elements where the
2825/// sign-bit of the first element is 0 and the sign-bit of the second element
2826/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2827/// This intrinsic returns the value of the CF flag.
2828///
2829/// \headerfile <x86intrin.h>
2830///
2831/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2832///
2833/// \param __a
2834/// A 256-bit vector of [8 x float].
2835/// \param __b
2836/// A 256-bit vector of [8 x float].
2837/// \returns the CF flag.
2838static __inline int __DEFAULT_FN_ATTRS
2839_mm256_testc_ps(__m256 __a, __m256 __b)
2840{
2841 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2842}
2843
2844/// Given two 256-bit floating-point vectors of [8 x float], perform an
2845/// element-by-element comparison of the single-precision elements in the
2846/// first source vector and the corresponding elements in the second source
2847/// vector.
2848///
2849/// The EFLAGS register is updated as follows: \n
2850/// If there is at least one pair of single-precision elements where the
2851/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2852/// ZF flag is set to 1. \n
2853/// If there is at least one pair of single-precision elements where the
2854/// sign-bit of the first element is 0 and the sign-bit of the second element
2855/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2856/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2857/// otherwise it returns 0.
2858///
2859/// \headerfile <x86intrin.h>
2860///
2861/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2862///
2863/// \param __a
2864/// A 256-bit vector of [8 x float].
2865/// \param __b
2866/// A 256-bit vector of [8 x float].
2867/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2868static __inline int __DEFAULT_FN_ATTRS
2870{
2871 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2872}
2873
2874/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2875/// of the two source vectors.
2876///
2877/// The EFLAGS register is updated as follows: \n
2878/// If there is at least one pair of bits where both bits are 1, the ZF flag
2879/// is set to 0. Otherwise the ZF flag is set to 1. \n
2880/// If there is at least one pair of bits where the bit from the first source
2881/// vector is 0 and the bit from the second source vector is 1, the CF flag
2882/// is set to 0. Otherwise the CF flag is set to 1. \n
2883/// This intrinsic returns the value of the ZF flag.
2884///
2885/// \headerfile <x86intrin.h>
2886///
2887/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2888///
2889/// \param __a
2890/// A 256-bit integer vector.
2891/// \param __b
2892/// A 256-bit integer vector.
2893/// \returns the ZF flag.
2894static __inline int __DEFAULT_FN_ATTRS
2895_mm256_testz_si256(__m256i __a, __m256i __b)
2896{
2897 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2898}
2899
2900/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2901/// of the two source vectors.
2902///
2903/// The EFLAGS register is updated as follows: \n
2904/// If there is at least one pair of bits where both bits are 1, the ZF flag
2905/// is set to 0. Otherwise the ZF flag is set to 1. \n
2906/// If there is at least one pair of bits where the bit from the first source
2907/// vector is 0 and the bit from the second source vector is 1, the CF flag
2908/// is set to 0. Otherwise the CF flag is set to 1. \n
2909/// This intrinsic returns the value of the CF flag.
2910///
2911/// \headerfile <x86intrin.h>
2912///
2913/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2914///
2915/// \param __a
2916/// A 256-bit integer vector.
2917/// \param __b
2918/// A 256-bit integer vector.
2919/// \returns the CF flag.
2920static __inline int __DEFAULT_FN_ATTRS
2921_mm256_testc_si256(__m256i __a, __m256i __b)
2922{
2923 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2924}
2925
2926/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2927/// of the two source vectors.
2928///
2929/// The EFLAGS register is updated as follows: \n
2930/// If there is at least one pair of bits where both bits are 1, the ZF flag
2931/// is set to 0. Otherwise the ZF flag is set to 1. \n
2932/// If there is at least one pair of bits where the bit from the first source
2933/// vector is 0 and the bit from the second source vector is 1, the CF flag
2934/// is set to 0. Otherwise the CF flag is set to 1. \n
2935/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2936/// otherwise it returns 0.
2937///
2938/// \headerfile <x86intrin.h>
2939///
2940/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2941///
2942/// \param __a
2943/// A 256-bit integer vector.
2944/// \param __b
2945/// A 256-bit integer vector.
2946/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2947static __inline int __DEFAULT_FN_ATTRS
2949{
2950 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2951}
2952
/* Vector extract sign mask */
2954/// Extracts the sign bits of double-precision floating point elements
2955/// in a 256-bit vector of [4 x double] and writes them to the lower order
2956/// bits of the return value.
2957///
2958/// \headerfile <x86intrin.h>
2959///
2960/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2961///
2962/// \param __a
2963/// A 256-bit vector of [4 x double] containing the double-precision
2964/// floating point values with sign bits to be extracted.
2965/// \returns The sign bits from the operand, written to bits [3:0].
2966static __inline int __DEFAULT_FN_ATTRS
2968{
2969 return __builtin_ia32_movmskpd256((__v4df)__a);
2970}
2971
2972/// Extracts the sign bits of single-precision floating point elements
2973/// in a 256-bit vector of [8 x float] and writes them to the lower order
2974/// bits of the return value.
2975///
2976/// \headerfile <x86intrin.h>
2977///
2978/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2979///
2980/// \param __a
2981/// A 256-bit vector of [8 x float] containing the single-precision floating
2982/// point values with sign bits to be extracted.
2983/// \returns The sign bits from the operand, written to bits [7:0].
2984static __inline int __DEFAULT_FN_ATTRS
2986{
2987 return __builtin_ia32_movmskps256((__v8sf)__a);
2988}
2989
/* Vector __zero */
/// Clears the contents of all XMM or YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
    __attribute__((__always_inline__, __nodebug__, __target__("avx")))
    _mm256_zeroall(void) {
  __builtin_ia32_vzeroall();
}
3001
/// Clears the upper 128 bits (bits 255:128) of all YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
    __attribute__((__always_inline__, __nodebug__, __target__("avx")))
    _mm256_zeroupper(void) {
  __builtin_ia32_vzeroupper();
}
3012
/* Vector load with broadcast */
3014/// Loads a scalar single-precision floating point value from the
3015/// specified address pointed to by \a __a and broadcasts it to the elements
3016/// of a [4 x float] vector.
3017///
3018/// \headerfile <x86intrin.h>
3019///
3020/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3021///
3022/// \param __a
3023/// The single-precision floating point value to be broadcast.
3024/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3025/// equal to the broadcast value.
3026static __inline __m128 __DEFAULT_FN_ATTRS128
3028{
3029 struct __mm_broadcast_ss_struct {
3030 float __f;
3031 } __attribute__((__packed__, __may_alias__));
3032 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3033 return __extension__ (__m128){ __f, __f, __f, __f };
3034}
3035
3036/// Loads a scalar double-precision floating point value from the
3037/// specified address pointed to by \a __a and broadcasts it to the elements
3038/// of a [4 x double] vector.
3039///
3040/// \headerfile <x86intrin.h>
3041///
3042/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3043///
3044/// \param __a
3045/// The double-precision floating point value to be broadcast.
3046/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3047/// equal to the broadcast value.
3048static __inline __m256d __DEFAULT_FN_ATTRS
3050{
3051 struct __mm256_broadcast_sd_struct {
3052 double __d;
3053 } __attribute__((__packed__, __may_alias__));
3054 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3055 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3056}
3057
3058/// Loads a scalar single-precision floating point value from the
3059/// specified address pointed to by \a __a and broadcasts it to the elements
3060/// of a [8 x float] vector.
3061///
3062/// \headerfile <x86intrin.h>
3063///
3064/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3065///
3066/// \param __a
3067/// The single-precision floating point value to be broadcast.
3068/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3069/// equal to the broadcast value.
3070static __inline __m256 __DEFAULT_FN_ATTRS
3072{
3073 struct __mm256_broadcast_ss_struct {
3074 float __f;
3075 } __attribute__((__packed__, __may_alias__));
3076 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3077 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3078}
3079
3080/// Loads the data from a 128-bit vector of [2 x double] from the
3081/// specified address pointed to by \a __a and broadcasts it to 128-bit
3082/// elements in a 256-bit vector of [4 x double].
3083///
3084/// \headerfile <x86intrin.h>
3085///
3086/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3087///
3088/// \param __a
3089/// The 128-bit vector of [2 x double] to be broadcast.
3090/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3091/// equal to the broadcast value.
3092static __inline __m256d __DEFAULT_FN_ATTRS
3094{
3095 __m128d __b = _mm_loadu_pd((const double *)__a);
3096 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3097 0, 1, 0, 1);
3098}
3099
3100/// Loads the data from a 128-bit vector of [4 x float] from the
3101/// specified address pointed to by \a __a and broadcasts it to 128-bit
3102/// elements in a 256-bit vector of [8 x float].
3103///
3104/// \headerfile <x86intrin.h>
3105///
3106/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3107///
3108/// \param __a
3109/// The 128-bit vector of [4 x float] to be broadcast.
3110/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3111/// equal to the broadcast value.
3112static __inline __m256 __DEFAULT_FN_ATTRS
3114{
3115 __m128 __b = _mm_loadu_ps((const float *)__a);
3116 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3117 0, 1, 2, 3, 0, 1, 2, 3);
3118}
3119
3120/* SIMD load ops */
3121/// Loads 4 double-precision floating point values from a 32-byte aligned
3122/// memory location pointed to by \a __p into a vector of [4 x double].
3123///
3124/// \headerfile <x86intrin.h>
3125///
3126/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3127///
3128/// \param __p
3129/// A 32-byte aligned pointer to a memory location containing
3130/// double-precision floating point values.
3131/// \returns A 256-bit vector of [4 x double] containing the moved values.
3132static __inline __m256d __DEFAULT_FN_ATTRS
3133_mm256_load_pd(double const *__p)
3134{
3135 return *(const __m256d *)__p;
3136}
3137
3138/// Loads 8 single-precision floating point values from a 32-byte aligned
3139/// memory location pointed to by \a __p into a vector of [8 x float].
3140///
3141/// \headerfile <x86intrin.h>
3142///
3143/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3144///
3145/// \param __p
3146/// A 32-byte aligned pointer to a memory location containing float values.
3147/// \returns A 256-bit vector of [8 x float] containing the moved values.
3148static __inline __m256 __DEFAULT_FN_ATTRS
3149_mm256_load_ps(float const *__p)
3150{
3151 return *(const __m256 *)__p;
3152}
3153
3154/// Loads 4 double-precision floating point values from an unaligned
3155/// memory location pointed to by \a __p into a vector of [4 x double].
3156///
3157/// \headerfile <x86intrin.h>
3158///
3159/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3160///
3161/// \param __p
3162/// A pointer to a memory location containing double-precision floating
3163/// point values.
3164/// \returns A 256-bit vector of [4 x double] containing the moved values.
3165static __inline __m256d __DEFAULT_FN_ATTRS
3166_mm256_loadu_pd(double const *__p)
3167{
3168 struct __loadu_pd {
3169 __m256d_u __v;
3170 } __attribute__((__packed__, __may_alias__));
3171 return ((const struct __loadu_pd*)__p)->__v;
3172}
3173
3174/// Loads 8 single-precision floating point values from an unaligned
3175/// memory location pointed to by \a __p into a vector of [8 x float].
3176///
3177/// \headerfile <x86intrin.h>
3178///
3179/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3180///
3181/// \param __p
3182/// A pointer to a memory location containing single-precision floating
3183/// point values.
3184/// \returns A 256-bit vector of [8 x float] containing the moved values.
3185static __inline __m256 __DEFAULT_FN_ATTRS
3187{
3188 struct __loadu_ps {
3189 __m256_u __v;
3190 } __attribute__((__packed__, __may_alias__));
3191 return ((const struct __loadu_ps*)__p)->__v;
3192}
3193
3194/// Loads 256 bits of integer data from a 32-byte aligned memory
3195/// location pointed to by \a __p into elements of a 256-bit integer vector.
3196///
3197/// \headerfile <x86intrin.h>
3198///
3199/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3200///
3201/// \param __p
3202/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3203/// values.
3204/// \returns A 256-bit integer vector containing the moved values.
3205static __inline __m256i __DEFAULT_FN_ATTRS
3206_mm256_load_si256(__m256i const *__p)
3207{
3208 return *__p;
3209}
3210
3211/// Loads 256 bits of integer data from an unaligned memory location
3212/// pointed to by \a __p into a 256-bit integer vector.
3213///
3214/// \headerfile <x86intrin.h>
3215///
3216/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3217///
3218/// \param __p
3219/// A pointer to a 256-bit integer vector containing integer values.
3220/// \returns A 256-bit integer vector containing the moved values.
3221static __inline __m256i __DEFAULT_FN_ATTRS
3222_mm256_loadu_si256(__m256i_u const *__p)
3223{
3224 struct __loadu_si256 {
3225 __m256i_u __v;
3226 } __attribute__((__packed__, __may_alias__));
3227 return ((const struct __loadu_si256*)__p)->__v;
3228}
3229
3230/// Loads 256 bits of integer data from an unaligned memory location
3231/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3232/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3233/// line boundary.
3234///
3235/// \headerfile <x86intrin.h>
3236///
3237/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3238///
3239/// \param __p
3240/// A pointer to a 256-bit integer vector containing integer values.
3241/// \returns A 256-bit integer vector containing the moved values.
3242static __inline __m256i __DEFAULT_FN_ATTRS
3243_mm256_lddqu_si256(__m256i_u const *__p)
3244{
3245 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3246}
3247
3248/* SIMD store ops */
3249/// Stores double-precision floating point values from a 256-bit vector
3250/// of [4 x double] to a 32-byte aligned memory location pointed to by
3251/// \a __p.
3252///
3253/// \headerfile <x86intrin.h>
3254///
3255/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3256///
3257/// \param __p
3258/// A 32-byte aligned pointer to a memory location that will receive the
3259/// double-precision floaing point values.
3260/// \param __a
3261/// A 256-bit vector of [4 x double] containing the values to be moved.
3262static __inline void __DEFAULT_FN_ATTRS
3263_mm256_store_pd(double *__p, __m256d __a)
3264{
3265 *(__m256d *)__p = __a;
3266}
3267
3268/// Stores single-precision floating point values from a 256-bit vector
3269/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3270///
3271/// \headerfile <x86intrin.h>
3272///
3273/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3274///
3275/// \param __p
3276/// A 32-byte aligned pointer to a memory location that will receive the
3277/// float values.
3278/// \param __a
3279/// A 256-bit vector of [8 x float] containing the values to be moved.
3280static __inline void __DEFAULT_FN_ATTRS
3281_mm256_store_ps(float *__p, __m256 __a)
3282{
3283 *(__m256 *)__p = __a;
3284}
3285
3286/// Stores double-precision floating point values from a 256-bit vector
3287/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3288///
3289/// \headerfile <x86intrin.h>
3290///
3291/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3292///
3293/// \param __p
3294/// A pointer to a memory location that will receive the double-precision
3295/// floating point values.
3296/// \param __a
3297/// A 256-bit vector of [4 x double] containing the values to be moved.
3298static __inline void __DEFAULT_FN_ATTRS
3299_mm256_storeu_pd(double *__p, __m256d __a)
3300{
3301 struct __storeu_pd {
3302 __m256d_u __v;
3303 } __attribute__((__packed__, __may_alias__));
3304 ((struct __storeu_pd*)__p)->__v = __a;
3305}
3306
3307/// Stores single-precision floating point values from a 256-bit vector
3308/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3309///
3310/// \headerfile <x86intrin.h>
3311///
3312/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3313///
3314/// \param __p
3315/// A pointer to a memory location that will receive the float values.
3316/// \param __a
3317/// A 256-bit vector of [8 x float] containing the values to be moved.
3318static __inline void __DEFAULT_FN_ATTRS
3319_mm256_storeu_ps(float *__p, __m256 __a)
3320{
3321 struct __storeu_ps {
3322 __m256_u __v;
3323 } __attribute__((__packed__, __may_alias__));
3324 ((struct __storeu_ps*)__p)->__v = __a;
3325}
3326
3327/// Stores integer values from a 256-bit integer vector to a 32-byte
3328/// aligned memory location pointed to by \a __p.
3329///
3330/// \headerfile <x86intrin.h>
3331///
3332/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3333///
3334/// \param __p
3335/// A 32-byte aligned pointer to a memory location that will receive the
3336/// integer values.
3337/// \param __a
3338/// A 256-bit integer vector containing the values to be moved.
3339static __inline void __DEFAULT_FN_ATTRS
3340_mm256_store_si256(__m256i *__p, __m256i __a)
3341{
3342 *__p = __a;
3343}
3344
3345/// Stores integer values from a 256-bit integer vector to an unaligned
3346/// memory location pointed to by \a __p.
3347///
3348/// \headerfile <x86intrin.h>
3349///
3350/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3351///
3352/// \param __p
3353/// A pointer to a memory location that will receive the integer values.
3354/// \param __a
3355/// A 256-bit integer vector containing the values to be moved.
3356static __inline void __DEFAULT_FN_ATTRS
3357_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3358{
3359 struct __storeu_si256 {
3360 __m256i_u __v;
3361 } __attribute__((__packed__, __may_alias__));
3362 ((struct __storeu_si256*)__p)->__v = __a;
3363}
3364
3365/* Conditional load ops */
3366/// Conditionally loads double-precision floating point elements from a
3367/// memory location pointed to by \a __p into a 128-bit vector of
3368/// [2 x double], depending on the mask bits associated with each data
3369/// element.
3370///
3371/// \headerfile <x86intrin.h>
3372///
3373/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3374///
3375/// \param __p
3376/// A pointer to a memory location that contains the double-precision
3377/// floating point values.
3378/// \param __m
3379/// A 128-bit integer vector containing the mask. The most significant bit of
3380/// each data element represents the mask bits. If a mask bit is zero, the
3381/// corresponding value in the memory location is not loaded and the
3382/// corresponding field in the return value is set to zero.
3383/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3384static __inline __m128d __DEFAULT_FN_ATTRS128
3385_mm_maskload_pd(double const *__p, __m128i __m)
3386{
3387 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3388}
3389
3390/// Conditionally loads double-precision floating point elements from a
3391/// memory location pointed to by \a __p into a 256-bit vector of
3392/// [4 x double], depending on the mask bits associated with each data
3393/// element.
3394///
3395/// \headerfile <x86intrin.h>
3396///
3397/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3398///
3399/// \param __p
3400/// A pointer to a memory location that contains the double-precision
3401/// floating point values.
3402/// \param __m
3403/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3404/// significant bit of each quadword element represents the mask bits. If a
3405/// mask bit is zero, the corresponding value in the memory location is not
3406/// loaded and the corresponding field in the return value is set to zero.
3407/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3408static __inline __m256d __DEFAULT_FN_ATTRS
3409_mm256_maskload_pd(double const *__p, __m256i __m)
3410{
3411 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3412 (__v4di)__m);
3413}
3414
3415/// Conditionally loads single-precision floating point elements from a
3416/// memory location pointed to by \a __p into a 128-bit vector of
3417/// [4 x float], depending on the mask bits associated with each data
3418/// element.
3419///
3420/// \headerfile <x86intrin.h>
3421///
3422/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3423///
3424/// \param __p
3425/// A pointer to a memory location that contains the single-precision
3426/// floating point values.
3427/// \param __m
3428/// A 128-bit integer vector containing the mask. The most significant bit of
3429/// each data element represents the mask bits. If a mask bit is zero, the
3430/// corresponding value in the memory location is not loaded and the
3431/// corresponding field in the return value is set to zero.
3432/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3433static __inline __m128 __DEFAULT_FN_ATTRS128
3434_mm_maskload_ps(float const *__p, __m128i __m)
3435{
3436 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3437}
3438
3439/// Conditionally loads single-precision floating point elements from a
3440/// memory location pointed to by \a __p into a 256-bit vector of
3441/// [8 x float], depending on the mask bits associated with each data
3442/// element.
3443///
3444/// \headerfile <x86intrin.h>
3445///
3446/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3447///
3448/// \param __p
3449/// A pointer to a memory location that contains the single-precision
3450/// floating point values.
3451/// \param __m
3452/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3453/// significant bit of each dword element represents the mask bits. If a mask
3454/// bit is zero, the corresponding value in the memory location is not loaded
3455/// and the corresponding field in the return value is set to zero.
3456/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3457static __inline __m256 __DEFAULT_FN_ATTRS
3458_mm256_maskload_ps(float const *__p, __m256i __m)
3459{
3460 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3461}
3462
3463/* Conditional store ops */
3464/// Moves single-precision floating point values from a 256-bit vector
3465/// of [8 x float] to a memory location pointed to by \a __p, according to
3466/// the specified mask.
3467///
3468/// \headerfile <x86intrin.h>
3469///
3470/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3471///
3472/// \param __p
3473/// A pointer to a memory location that will receive the float values.
3474/// \param __m
3475/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3476/// significant bit of each dword element in the mask vector represents the
3477/// mask bits. If a mask bit is zero, the corresponding value from vector
3478/// \a __a is not stored and the corresponding field in the memory location
3479/// pointed to by \a __p is not changed.
3480/// \param __a
3481/// A 256-bit vector of [8 x float] containing the values to be stored.
3482static __inline void __DEFAULT_FN_ATTRS
3483_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3484{
3485 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3486}
3487
3488/// Moves double-precision values from a 128-bit vector of [2 x double]
3489/// to a memory location pointed to by \a __p, according to the specified
3490/// mask.
3491///
3492/// \headerfile <x86intrin.h>
3493///
3494/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3495///
3496/// \param __p
3497/// A pointer to a memory location that will receive the float values.
3498/// \param __m
3499/// A 128-bit integer vector containing the mask. The most significant bit of
3500/// each field in the mask vector represents the mask bits. If a mask bit is
3501/// zero, the corresponding value from vector \a __a is not stored and the
3502/// corresponding field in the memory location pointed to by \a __p is not
3503/// changed.
3504/// \param __a
3505/// A 128-bit vector of [2 x double] containing the values to be stored.
3506static __inline void __DEFAULT_FN_ATTRS128
3507_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3508{
3509 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3510}
3511
3512/// Moves double-precision values from a 256-bit vector of [4 x double]
3513/// to a memory location pointed to by \a __p, according to the specified
3514/// mask.
3515///
3516/// \headerfile <x86intrin.h>
3517///
3518/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3519///
3520/// \param __p
3521/// A pointer to a memory location that will receive the float values.
3522/// \param __m
3523/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3524/// significant bit of each quadword element in the mask vector represents
3525/// the mask bits. If a mask bit is zero, the corresponding value from vector
3526/// __a is not stored and the corresponding field in the memory location
3527/// pointed to by \a __p is not changed.
3528/// \param __a
3529/// A 256-bit vector of [4 x double] containing the values to be stored.
3530static __inline void __DEFAULT_FN_ATTRS
3531_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3532{
3533 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3534}
3535
3536/// Moves single-precision floating point values from a 128-bit vector
3537/// of [4 x float] to a memory location pointed to by \a __p, according to
3538/// the specified mask.
3539///
3540/// \headerfile <x86intrin.h>
3541///
3542/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3543///
3544/// \param __p
3545/// A pointer to a memory location that will receive the float values.
3546/// \param __m
3547/// A 128-bit integer vector containing the mask. The most significant bit of
3548/// each field in the mask vector represents the mask bits. If a mask bit is
3549/// zero, the corresponding value from vector __a is not stored and the
3550/// corresponding field in the memory location pointed to by \a __p is not
3551/// changed.
3552/// \param __a
3553/// A 128-bit vector of [4 x float] containing the values to be stored.
3554static __inline void __DEFAULT_FN_ATTRS128
3555_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3556{
3557 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3558}
3559
3560/* Cacheability support ops */
3561/// Moves integer data from a 256-bit integer vector to a 32-byte
3562/// aligned memory location. To minimize caching, the data is flagged as
3563/// non-temporal (unlikely to be used again soon).
3564///
3565/// \headerfile <x86intrin.h>
3566///
3567/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3568///
3569/// \param __a
3570/// A pointer to a 32-byte aligned memory location that will receive the
3571/// integer values.
3572/// \param __b
3573/// A 256-bit integer vector containing the values to be moved.
3574static __inline void __DEFAULT_FN_ATTRS
3576{
3577 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3578 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3579}
3580
3581/// Moves double-precision values from a 256-bit vector of [4 x double]
3582/// to a 32-byte aligned memory location. To minimize caching, the data is
3583/// flagged as non-temporal (unlikely to be used again soon).
3584///
3585/// \headerfile <x86intrin.h>
3586///
3587/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3588///
3589/// \param __a
3590/// A pointer to a 32-byte aligned memory location that will receive the
3591/// double-precision floating-point values.
3592/// \param __b
3593/// A 256-bit vector of [4 x double] containing the values to be moved.
3594static __inline void __DEFAULT_FN_ATTRS
3595_mm256_stream_pd(void *__a, __m256d __b)
3596{
3597 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3598 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3599}
3600
3601/// Moves single-precision floating point values from a 256-bit vector
3602/// of [8 x float] to a 32-byte aligned memory location. To minimize
3603/// caching, the data is flagged as non-temporal (unlikely to be used again
3604/// soon).
3605///
3606/// \headerfile <x86intrin.h>
3607///
3608/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3609///
3610/// \param __p
3611/// A pointer to a 32-byte aligned memory location that will receive the
3612/// single-precision floating point values.
3613/// \param __a
3614/// A 256-bit vector of [8 x float] containing the values to be moved.
3615static __inline void __DEFAULT_FN_ATTRS
3617{
3618 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3619 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3620}
3621
3622/* Create vectors */
3623/// Create a 256-bit vector of [4 x double] with undefined values.
3624///
3625/// \headerfile <x86intrin.h>
3626///
3627/// This intrinsic has no corresponding instruction.
3628///
3629/// \returns A 256-bit vector of [4 x double] containing undefined values.
3630static __inline__ __m256d __DEFAULT_FN_ATTRS
3632{
3633 return (__m256d)__builtin_ia32_undef256();
3634}
3635
3636/// Create a 256-bit vector of [8 x float] with undefined values.
3637///
3638/// \headerfile <x86intrin.h>
3639///
3640/// This intrinsic has no corresponding instruction.
3641///
3642/// \returns A 256-bit vector of [8 x float] containing undefined values.
3643static __inline__ __m256 __DEFAULT_FN_ATTRS
3645{
3646 return (__m256)__builtin_ia32_undef256();
3647}
3648
3649/// Create a 256-bit integer vector with undefined values.
3650///
3651/// \headerfile <x86intrin.h>
3652///
3653/// This intrinsic has no corresponding instruction.
3654///
3655/// \returns A 256-bit integer vector containing undefined values.
3656static __inline__ __m256i __DEFAULT_FN_ATTRS
3658{
3659 return (__m256i)__builtin_ia32_undef256();
3660}
3661
3662/// Constructs a 256-bit floating-point vector of [4 x double]
3663/// initialized with the specified double-precision floating-point values.
3664///
3665/// \headerfile <x86intrin.h>
3666///
3667/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3668/// instruction.
3669///
3670/// \param __a
3671/// A double-precision floating-point value used to initialize bits [255:192]
3672/// of the result.
3673/// \param __b
3674/// A double-precision floating-point value used to initialize bits [191:128]
3675/// of the result.
3676/// \param __c
3677/// A double-precision floating-point value used to initialize bits [127:64]
3678/// of the result.
3679/// \param __d
3680/// A double-precision floating-point value used to initialize bits [63:0]
3681/// of the result.
3682/// \returns An initialized 256-bit floating-point vector of [4 x double].
3683static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3684_mm256_set_pd(double __a, double __b, double __c, double __d)
3685{
3686 return __extension__ (__m256d){ __d, __c, __b, __a };
3687}
3688
3689/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3690/// with the specified single-precision floating-point values.
3691///
3692/// \headerfile <x86intrin.h>
3693///
3694/// This intrinsic is a utility function and does not correspond to a specific
3695/// instruction.
3696///
3697/// \param __a
3698/// A single-precision floating-point value used to initialize bits [255:224]
3699/// of the result.
3700/// \param __b
3701/// A single-precision floating-point value used to initialize bits [223:192]
3702/// of the result.
3703/// \param __c
3704/// A single-precision floating-point value used to initialize bits [191:160]
3705/// of the result.
3706/// \param __d
3707/// A single-precision floating-point value used to initialize bits [159:128]
3708/// of the result.
3709/// \param __e
3710/// A single-precision floating-point value used to initialize bits [127:96]
3711/// of the result.
3712/// \param __f
3713/// A single-precision floating-point value used to initialize bits [95:64]
3714/// of the result.
3715/// \param __g
3716/// A single-precision floating-point value used to initialize bits [63:32]
3717/// of the result.
3718/// \param __h
3719/// A single-precision floating-point value used to initialize bits [31:0]
3720/// of the result.
3721/// \returns An initialized 256-bit floating-point vector of [8 x float].
3722static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3723_mm256_set_ps(float __a, float __b, float __c, float __d,
3724 float __e, float __f, float __g, float __h)
3725{
3726 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3727}
3728
3729/// Constructs a 256-bit integer vector initialized with the specified
3730/// 32-bit integral values.
3731///
3732/// \headerfile <x86intrin.h>
3733///
3734/// This intrinsic is a utility function and does not correspond to a specific
3735/// instruction.
3736///
3737/// \param __i0
3738/// A 32-bit integral value used to initialize bits [255:224] of the result.
3739/// \param __i1
3740/// A 32-bit integral value used to initialize bits [223:192] of the result.
3741/// \param __i2
3742/// A 32-bit integral value used to initialize bits [191:160] of the result.
3743/// \param __i3
3744/// A 32-bit integral value used to initialize bits [159:128] of the result.
3745/// \param __i4
3746/// A 32-bit integral value used to initialize bits [127:96] of the result.
3747/// \param __i5
3748/// A 32-bit integral value used to initialize bits [95:64] of the result.
3749/// \param __i6
3750/// A 32-bit integral value used to initialize bits [63:32] of the result.
3751/// \param __i7
3752/// A 32-bit integral value used to initialize bits [31:0] of the result.
3753/// \returns An initialized 256-bit integer vector.
3754static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3755_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3756 int __i4, int __i5, int __i6, int __i7)
3757{
3758 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3759}
3760
3761/// Constructs a 256-bit integer vector initialized with the specified
3762/// 16-bit integral values.
3763///
3764/// \headerfile <x86intrin.h>
3765///
3766/// This intrinsic is a utility function and does not correspond to a specific
3767/// instruction.
3768///
3769/// \param __w15
3770/// A 16-bit integral value used to initialize bits [255:240] of the result.
3771/// \param __w14
3772/// A 16-bit integral value used to initialize bits [239:224] of the result.
3773/// \param __w13
3774/// A 16-bit integral value used to initialize bits [223:208] of the result.
3775/// \param __w12
3776/// A 16-bit integral value used to initialize bits [207:192] of the result.
3777/// \param __w11
3778/// A 16-bit integral value used to initialize bits [191:176] of the result.
3779/// \param __w10
3780/// A 16-bit integral value used to initialize bits [175:160] of the result.
3781/// \param __w09
3782/// A 16-bit integral value used to initialize bits [159:144] of the result.
3783/// \param __w08
3784/// A 16-bit integral value used to initialize bits [143:128] of the result.
3785/// \param __w07
3786/// A 16-bit integral value used to initialize bits [127:112] of the result.
3787/// \param __w06
3788/// A 16-bit integral value used to initialize bits [111:96] of the result.
3789/// \param __w05
3790/// A 16-bit integral value used to initialize bits [95:80] of the result.
3791/// \param __w04
3792/// A 16-bit integral value used to initialize bits [79:64] of the result.
3793/// \param __w03
3794/// A 16-bit integral value used to initialize bits [63:48] of the result.
3795/// \param __w02
3796/// A 16-bit integral value used to initialize bits [47:32] of the result.
3797/// \param __w01
3798/// A 16-bit integral value used to initialize bits [31:16] of the result.
3799/// \param __w00
3800/// A 16-bit integral value used to initialize bits [15:0] of the result.
3801/// \returns An initialized 256-bit integer vector.
3802static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3803_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3804 short __w11, short __w10, short __w09, short __w08,
3805 short __w07, short __w06, short __w05, short __w04,
3806 short __w03, short __w02, short __w01, short __w00)
3807{
3808 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3809 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3810}
3811
3812/// Constructs a 256-bit integer vector initialized with the specified
3813/// 8-bit integral values.
3814///
3815/// \headerfile <x86intrin.h>
3816///
3817/// This intrinsic is a utility function and does not correspond to a specific
3818/// instruction.
3819///
3820/// \param __b31
3821/// An 8-bit integral value used to initialize bits [255:248] of the result.
3822/// \param __b30
3823/// An 8-bit integral value used to initialize bits [247:240] of the result.
3824/// \param __b29
3825/// An 8-bit integral value used to initialize bits [239:232] of the result.
3826/// \param __b28
3827/// An 8-bit integral value used to initialize bits [231:224] of the result.
3828/// \param __b27
3829/// An 8-bit integral value used to initialize bits [223:216] of the result.
3830/// \param __b26
3831/// An 8-bit integral value used to initialize bits [215:208] of the result.
3832/// \param __b25
3833/// An 8-bit integral value used to initialize bits [207:200] of the result.
3834/// \param __b24
3835/// An 8-bit integral value used to initialize bits [199:192] of the result.
3836/// \param __b23
3837/// An 8-bit integral value used to initialize bits [191:184] of the result.
3838/// \param __b22
3839/// An 8-bit integral value used to initialize bits [183:176] of the result.
3840/// \param __b21
3841/// An 8-bit integral value used to initialize bits [175:168] of the result.
3842/// \param __b20
3843/// An 8-bit integral value used to initialize bits [167:160] of the result.
3844/// \param __b19
3845/// An 8-bit integral value used to initialize bits [159:152] of the result.
3846/// \param __b18
3847/// An 8-bit integral value used to initialize bits [151:144] of the result.
3848/// \param __b17
3849/// An 8-bit integral value used to initialize bits [143:136] of the result.
3850/// \param __b16
3851/// An 8-bit integral value used to initialize bits [135:128] of the result.
3852/// \param __b15
3853/// An 8-bit integral value used to initialize bits [127:120] of the result.
3854/// \param __b14
3855/// An 8-bit integral value used to initialize bits [119:112] of the result.
3856/// \param __b13
3857/// An 8-bit integral value used to initialize bits [111:104] of the result.
3858/// \param __b12
3859/// An 8-bit integral value used to initialize bits [103:96] of the result.
3860/// \param __b11
3861/// An 8-bit integral value used to initialize bits [95:88] of the result.
3862/// \param __b10
3863/// An 8-bit integral value used to initialize bits [87:80] of the result.
3864/// \param __b09
3865/// An 8-bit integral value used to initialize bits [79:72] of the result.
3866/// \param __b08
3867/// An 8-bit integral value used to initialize bits [71:64] of the result.
3868/// \param __b07
3869/// An 8-bit integral value used to initialize bits [63:56] of the result.
3870/// \param __b06
3871/// An 8-bit integral value used to initialize bits [55:48] of the result.
3872/// \param __b05
3873/// An 8-bit integral value used to initialize bits [47:40] of the result.
3874/// \param __b04
3875/// An 8-bit integral value used to initialize bits [39:32] of the result.
3876/// \param __b03
3877/// An 8-bit integral value used to initialize bits [31:24] of the result.
3878/// \param __b02
3879/// An 8-bit integral value used to initialize bits [23:16] of the result.
3880/// \param __b01
3881/// An 8-bit integral value used to initialize bits [15:8] of the result.
3882/// \param __b00
3883/// An 8-bit integral value used to initialize bits [7:0] of the result.
3884/// \returns An initialized 256-bit integer vector.
3885static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3886_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3887 char __b27, char __b26, char __b25, char __b24,
3888 char __b23, char __b22, char __b21, char __b20,
3889 char __b19, char __b18, char __b17, char __b16,
3890 char __b15, char __b14, char __b13, char __b12,
3891 char __b11, char __b10, char __b09, char __b08,
3892 char __b07, char __b06, char __b05, char __b04,
3893 char __b03, char __b02, char __b01, char __b00)
3894{
3895 return __extension__ (__m256i)(__v32qi){
3896 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3897 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3898 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3899 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3900 };
3901}
3902
3903/// Constructs a 256-bit integer vector initialized with the specified
3904/// 64-bit integral values.
3905///
3906/// \headerfile <x86intrin.h>
3907///
3908/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3909/// instruction.
3910///
3911/// \param __a
3912/// A 64-bit integral value used to initialize bits [255:192] of the result.
3913/// \param __b
3914/// A 64-bit integral value used to initialize bits [191:128] of the result.
3915/// \param __c
3916/// A 64-bit integral value used to initialize bits [127:64] of the result.
3917/// \param __d
3918/// A 64-bit integral value used to initialize bits [63:0] of the result.
3919/// \returns An initialized 256-bit integer vector.
3920static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3921_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3922{
3923 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3924}
3925
3926/* Create vectors with elements in reverse order */
3927/// Constructs a 256-bit floating-point vector of [4 x double],
3928/// initialized in reverse order with the specified double-precision
3929/// floating-point values.
3930///
3931/// \headerfile <x86intrin.h>
3932///
3933/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3934/// instruction.
3935///
3936/// \param __a
3937/// A double-precision floating-point value used to initialize bits [63:0]
3938/// of the result.
3939/// \param __b
3940/// A double-precision floating-point value used to initialize bits [127:64]
3941/// of the result.
3942/// \param __c
3943/// A double-precision floating-point value used to initialize bits [191:128]
3944/// of the result.
3945/// \param __d
3946/// A double-precision floating-point value used to initialize bits [255:192]
3947/// of the result.
3948/// \returns An initialized 256-bit floating-point vector of [4 x double].
3949static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3950_mm256_setr_pd(double __a, double __b, double __c, double __d)
3951{
3952 return _mm256_set_pd(__d, __c, __b, __a);
3953}
3954
3955/// Constructs a 256-bit floating-point vector of [8 x float],
3956/// initialized in reverse order with the specified single-precision
///    floating-point values.
3958///
3959/// \headerfile <x86intrin.h>
3960///
3961/// This intrinsic is a utility function and does not correspond to a specific
3962/// instruction.
3963///
3964/// \param __a
3965/// A single-precision floating-point value used to initialize bits [31:0]
3966/// of the result.
3967/// \param __b
3968/// A single-precision floating-point value used to initialize bits [63:32]
3969/// of the result.
3970/// \param __c
3971/// A single-precision floating-point value used to initialize bits [95:64]
3972/// of the result.
3973/// \param __d
3974/// A single-precision floating-point value used to initialize bits [127:96]
3975/// of the result.
3976/// \param __e
3977/// A single-precision floating-point value used to initialize bits [159:128]
3978/// of the result.
3979/// \param __f
3980/// A single-precision floating-point value used to initialize bits [191:160]
3981/// of the result.
3982/// \param __g
3983/// A single-precision floating-point value used to initialize bits [223:192]
3984/// of the result.
3985/// \param __h
3986/// A single-precision floating-point value used to initialize bits [255:224]
3987/// of the result.
3988/// \returns An initialized 256-bit floating-point vector of [8 x float].
3989static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3990_mm256_setr_ps(float __a, float __b, float __c, float __d,
3991 float __e, float __f, float __g, float __h)
3992{
3993 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3994}
3995
3996/// Constructs a 256-bit integer vector, initialized in reverse order
3997/// with the specified 32-bit integral values.
3998///
3999/// \headerfile <x86intrin.h>
4000///
4001/// This intrinsic is a utility function and does not correspond to a specific
4002/// instruction.
4003///
4004/// \param __i0
4005/// A 32-bit integral value used to initialize bits [31:0] of the result.
4006/// \param __i1
4007/// A 32-bit integral value used to initialize bits [63:32] of the result.
4008/// \param __i2
4009/// A 32-bit integral value used to initialize bits [95:64] of the result.
4010/// \param __i3
4011/// A 32-bit integral value used to initialize bits [127:96] of the result.
4012/// \param __i4
4013/// A 32-bit integral value used to initialize bits [159:128] of the result.
4014/// \param __i5
4015/// A 32-bit integral value used to initialize bits [191:160] of the result.
4016/// \param __i6
4017/// A 32-bit integral value used to initialize bits [223:192] of the result.
4018/// \param __i7
4019/// A 32-bit integral value used to initialize bits [255:224] of the result.
4020/// \returns An initialized 256-bit integer vector.
4021static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4022_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4023 int __i4, int __i5, int __i6, int __i7)
4024{
4025 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4026}
4027
4028/// Constructs a 256-bit integer vector, initialized in reverse order
4029/// with the specified 16-bit integral values.
4030///
4031/// \headerfile <x86intrin.h>
4032///
4033/// This intrinsic is a utility function and does not correspond to a specific
4034/// instruction.
4035///
4036/// \param __w15
4037/// A 16-bit integral value used to initialize bits [15:0] of the result.
4038/// \param __w14
4039/// A 16-bit integral value used to initialize bits [31:16] of the result.
4040/// \param __w13
4041/// A 16-bit integral value used to initialize bits [47:32] of the result.
4042/// \param __w12
4043/// A 16-bit integral value used to initialize bits [63:48] of the result.
4044/// \param __w11
4045/// A 16-bit integral value used to initialize bits [79:64] of the result.
4046/// \param __w10
4047/// A 16-bit integral value used to initialize bits [95:80] of the result.
4048/// \param __w09
4049/// A 16-bit integral value used to initialize bits [111:96] of the result.
4050/// \param __w08
4051/// A 16-bit integral value used to initialize bits [127:112] of the result.
4052/// \param __w07
4053/// A 16-bit integral value used to initialize bits [143:128] of the result.
4054/// \param __w06
4055/// A 16-bit integral value used to initialize bits [159:144] of the result.
4056/// \param __w05
4057/// A 16-bit integral value used to initialize bits [175:160] of the result.
4058/// \param __w04
4059/// A 16-bit integral value used to initialize bits [191:176] of the result.
4060/// \param __w03
4061/// A 16-bit integral value used to initialize bits [207:192] of the result.
4062/// \param __w02
4063/// A 16-bit integral value used to initialize bits [223:208] of the result.
4064/// \param __w01
4065/// A 16-bit integral value used to initialize bits [239:224] of the result.
4066/// \param __w00
4067/// A 16-bit integral value used to initialize bits [255:240] of the result.
4068/// \returns An initialized 256-bit integer vector.
4069static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4070_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4071 short __w11, short __w10, short __w09, short __w08,
4072 short __w07, short __w06, short __w05, short __w04,
4073 short __w03, short __w02, short __w01, short __w00)
4074{
4075 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4076 __w04, __w05, __w06, __w07,
4077 __w08, __w09, __w10, __w11,
4078 __w12, __w13, __w14, __w15);
4079}
4080
4081/// Constructs a 256-bit integer vector, initialized in reverse order
4082/// with the specified 8-bit integral values.
4083///
4084/// \headerfile <x86intrin.h>
4085///
4086/// This intrinsic is a utility function and does not correspond to a specific
4087/// instruction.
4088///
4089/// \param __b31
4090/// An 8-bit integral value used to initialize bits [7:0] of the result.
4091/// \param __b30
4092/// An 8-bit integral value used to initialize bits [15:8] of the result.
4093/// \param __b29
4094/// An 8-bit integral value used to initialize bits [23:16] of the result.
4095/// \param __b28
4096/// An 8-bit integral value used to initialize bits [31:24] of the result.
4097/// \param __b27
4098/// An 8-bit integral value used to initialize bits [39:32] of the result.
4099/// \param __b26
4100/// An 8-bit integral value used to initialize bits [47:40] of the result.
4101/// \param __b25
4102/// An 8-bit integral value used to initialize bits [55:48] of the result.
4103/// \param __b24
4104/// An 8-bit integral value used to initialize bits [63:56] of the result.
4105/// \param __b23
4106/// An 8-bit integral value used to initialize bits [71:64] of the result.
4107/// \param __b22
4108/// An 8-bit integral value used to initialize bits [79:72] of the result.
4109/// \param __b21
4110/// An 8-bit integral value used to initialize bits [87:80] of the result.
4111/// \param __b20
4112/// An 8-bit integral value used to initialize bits [95:88] of the result.
4113/// \param __b19
4114/// An 8-bit integral value used to initialize bits [103:96] of the result.
4115/// \param __b18
4116/// An 8-bit integral value used to initialize bits [111:104] of the result.
4117/// \param __b17
4118/// An 8-bit integral value used to initialize bits [119:112] of the result.
4119/// \param __b16
4120/// An 8-bit integral value used to initialize bits [127:120] of the result.
4121/// \param __b15
4122/// An 8-bit integral value used to initialize bits [135:128] of the result.
4123/// \param __b14
4124/// An 8-bit integral value used to initialize bits [143:136] of the result.
4125/// \param __b13
4126/// An 8-bit integral value used to initialize bits [151:144] of the result.
4127/// \param __b12
4128/// An 8-bit integral value used to initialize bits [159:152] of the result.
4129/// \param __b11
4130/// An 8-bit integral value used to initialize bits [167:160] of the result.
4131/// \param __b10
4132/// An 8-bit integral value used to initialize bits [175:168] of the result.
4133/// \param __b09
4134/// An 8-bit integral value used to initialize bits [183:176] of the result.
4135/// \param __b08
4136/// An 8-bit integral value used to initialize bits [191:184] of the result.
4137/// \param __b07
4138/// An 8-bit integral value used to initialize bits [199:192] of the result.
4139/// \param __b06
4140/// An 8-bit integral value used to initialize bits [207:200] of the result.
4141/// \param __b05
4142/// An 8-bit integral value used to initialize bits [215:208] of the result.
4143/// \param __b04
4144/// An 8-bit integral value used to initialize bits [223:216] of the result.
4145/// \param __b03
4146/// An 8-bit integral value used to initialize bits [231:224] of the result.
4147/// \param __b02
4148/// An 8-bit integral value used to initialize bits [239:232] of the result.
4149/// \param __b01
4150/// An 8-bit integral value used to initialize bits [247:240] of the result.
4151/// \param __b00
4152/// An 8-bit integral value used to initialize bits [255:248] of the result.
4153/// \returns An initialized 256-bit integer vector.
4154static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4155_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4156 char __b27, char __b26, char __b25, char __b24,
4157 char __b23, char __b22, char __b21, char __b20,
4158 char __b19, char __b18, char __b17, char __b16,
4159 char __b15, char __b14, char __b13, char __b12,
4160 char __b11, char __b10, char __b09, char __b08,
4161 char __b07, char __b06, char __b05, char __b04,
4162 char __b03, char __b02, char __b01, char __b00)
4163{
4164 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4165 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4166 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4167 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4168}
4169
4170/// Constructs a 256-bit integer vector, initialized in reverse order
4171/// with the specified 64-bit integral values.
4172///
4173/// \headerfile <x86intrin.h>
4174///
4175/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4176/// instruction.
4177///
4178/// \param __a
4179/// A 64-bit integral value used to initialize bits [63:0] of the result.
4180/// \param __b
4181/// A 64-bit integral value used to initialize bits [127:64] of the result.
4182/// \param __c
4183/// A 64-bit integral value used to initialize bits [191:128] of the result.
4184/// \param __d
4185/// A 64-bit integral value used to initialize bits [255:192] of the result.
4186/// \returns An initialized 256-bit integer vector.
4187static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4188_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4189{
4190 return _mm256_set_epi64x(__d, __c, __b, __a);
4191}
4192
4193/* Create vectors with repeated elements */
4194/// Constructs a 256-bit floating-point vector of [4 x double], with each
4195/// of the four double-precision floating-point vector elements set to the
4196/// specified double-precision floating-point value.
4197///
4198/// \headerfile <x86intrin.h>
4199///
4200/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4201///
4202/// \param __w
4203/// A double-precision floating-point value used to initialize each vector
4204/// element of the result.
4205/// \returns An initialized 256-bit floating-point vector of [4 x double].
4206static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4208{
4209 return _mm256_set_pd(__w, __w, __w, __w);
4210}
4211
4212/// Constructs a 256-bit floating-point vector of [8 x float], with each
4213/// of the eight single-precision floating-point vector elements set to the
4214/// specified single-precision floating-point value.
4215///
4216/// \headerfile <x86intrin.h>
4217///
4218/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4219/// instruction.
4220///
4221/// \param __w
4222/// A single-precision floating-point value used to initialize each vector
4223/// element of the result.
4224/// \returns An initialized 256-bit floating-point vector of [8 x float].
4225static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4227{
4228 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4229}
4230
4231/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4232/// 32-bit integral vector elements set to the specified 32-bit integral
4233/// value.
4234///
4235/// \headerfile <x86intrin.h>
4236///
4237/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4238/// instruction.
4239///
4240/// \param __i
4241/// A 32-bit integral value used to initialize each vector element of the
4242/// result.
4243/// \returns An initialized 256-bit integer vector of [8 x i32].
4244static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4246{
4247 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4248}
4249
4250/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4251/// 16-bit integral vector elements set to the specified 16-bit integral
4252/// value.
4253///
4254/// \headerfile <x86intrin.h>
4255///
4256/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4257///
4258/// \param __w
4259/// A 16-bit integral value used to initialize each vector element of the
4260/// result.
4261/// \returns An initialized 256-bit integer vector of [16 x i16].
4262static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4264{
4265 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4266 __w, __w, __w, __w, __w, __w, __w, __w);
4267}
4268
4269/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4270/// 8-bit integral vector elements set to the specified 8-bit integral value.
4271///
4272/// \headerfile <x86intrin.h>
4273///
4274/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4275///
4276/// \param __b
4277/// An 8-bit integral value used to initialize each vector element of the
4278/// result.
4279/// \returns An initialized 256-bit integer vector of [32 x i8].
4280static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4282{
4283 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4284 __b, __b, __b, __b, __b, __b, __b, __b,
4285 __b, __b, __b, __b, __b, __b, __b, __b,
4286 __b, __b, __b, __b, __b, __b, __b, __b);
4287}
4288
4289/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4290/// 64-bit integral vector elements set to the specified 64-bit integral
4291/// value.
4292///
4293/// \headerfile <x86intrin.h>
4294///
4295/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4296///
4297/// \param __q
4298/// A 64-bit integral value used to initialize each vector element of the
4299/// result.
4300/// \returns An initialized 256-bit integer vector of [4 x i64].
4301static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4303{
4304 return _mm256_set_epi64x(__q, __q, __q, __q);
4305}
4306
/* Create zeroed vectors */
4308/// Constructs a 256-bit floating-point vector of [4 x double] with all
4309/// vector elements initialized to zero.
4310///
4311/// \headerfile <x86intrin.h>
4312///
4313/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4314///
4315/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4317 return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
4318}
4319
4320/// Constructs a 256-bit floating-point vector of [8 x float] with all
4321/// vector elements initialized to zero.
4322///
4323/// \headerfile <x86intrin.h>
4324///
4325/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4326///
4327/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4329 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4330}
4331
4332/// Constructs a 256-bit integer vector initialized to zero.
4333///
4334/// \headerfile <x86intrin.h>
4335///
4336/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4337///
4338/// \returns A 256-bit integer vector initialized to zero.
4339static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4341 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4342}
4343
4344/* Cast between vector types */
4345/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4346/// floating-point vector of [8 x float].
4347///
4348/// \headerfile <x86intrin.h>
4349///
4350/// This intrinsic has no corresponding instruction.
4351///
4352/// \param __a
4353/// A 256-bit floating-point vector of [4 x double].
4354/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4355/// bitwise pattern as the parameter.
4356static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4358{
4359 return (__m256)__a;
4360}
4361
4362/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4363/// integer vector.
4364///
4365/// \headerfile <x86intrin.h>
4366///
4367/// This intrinsic has no corresponding instruction.
4368///
4369/// \param __a
4370/// A 256-bit floating-point vector of [4 x double].
4371/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4372/// parameter.
4373static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4375{
4376 return (__m256i)__a;
4377}
4378
4379/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4380/// floating-point vector of [4 x double].
4381///
4382/// \headerfile <x86intrin.h>
4383///
4384/// This intrinsic has no corresponding instruction.
4385///
4386/// \param __a
4387/// A 256-bit floating-point vector of [8 x float].
4388/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4389/// bitwise pattern as the parameter.
4390static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4392{
4393 return (__m256d)__a;
4394}
4395
4396/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4397/// integer vector.
4398///
4399/// \headerfile <x86intrin.h>
4400///
4401/// This intrinsic has no corresponding instruction.
4402///
4403/// \param __a
4404/// A 256-bit floating-point vector of [8 x float].
4405/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4406/// parameter.
4407static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4409{
4410 return (__m256i)__a;
4411}
4412
4413/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4414/// of [8 x float].
4415///
4416/// \headerfile <x86intrin.h>
4417///
4418/// This intrinsic has no corresponding instruction.
4419///
4420/// \param __a
4421/// A 256-bit integer vector.
4422/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4423/// bitwise pattern as the parameter.
4424static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4426{
4427 return (__m256)__a;
4428}
4429
4430/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4431/// of [4 x double].
4432///
4433/// \headerfile <x86intrin.h>
4434///
4435/// This intrinsic has no corresponding instruction.
4436///
4437/// \param __a
4438/// A 256-bit integer vector.
4439/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4440/// bitwise pattern as the parameter.
4441static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4443{
4444 return (__m256d)__a;
4445}
4446
4447/// Returns the lower 128 bits of a 256-bit floating-point vector of
4448/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4449///
4450/// \headerfile <x86intrin.h>
4451///
4452/// This intrinsic has no corresponding instruction.
4453///
4454/// \param __a
4455/// A 256-bit floating-point vector of [4 x double].
4456/// \returns A 128-bit floating-point vector of [2 x double] containing the
4457/// lower 128 bits of the parameter.
4458static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4460{
4461 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4462}
4463
4464/// Returns the lower 128 bits of a 256-bit floating-point vector of
4465/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4466///
4467/// \headerfile <x86intrin.h>
4468///
4469/// This intrinsic has no corresponding instruction.
4470///
4471/// \param __a
4472/// A 256-bit floating-point vector of [8 x float].
4473/// \returns A 128-bit floating-point vector of [4 x float] containing the
4474/// lower 128 bits of the parameter.
4475static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4477{
4478 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4479}
4480
4481/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4482///
4483/// \headerfile <x86intrin.h>
4484///
4485/// This intrinsic has no corresponding instruction.
4486///
4487/// \param __a
4488/// A 256-bit integer vector.
4489/// \returns A 128-bit integer vector containing the lower 128 bits of the
4490/// parameter.
4491static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4493{
4494 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4495}
4496
4497/// Constructs a 256-bit floating-point vector of [4 x double] from a
4498/// 128-bit floating-point vector of [2 x double].
4499///
4500/// The lower 128 bits contain the value of the source vector. The contents
4501/// of the upper 128 bits are undefined.
4502///
4503/// \headerfile <x86intrin.h>
4504///
4505/// This intrinsic has no corresponding instruction.
4506///
4507/// \param __a
4508/// A 128-bit vector of [2 x double].
4509/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4510/// contain the value of the parameter. The contents of the upper 128 bits
4511/// are undefined.
4512static __inline __m256d __DEFAULT_FN_ATTRS
4514{
4515 return __builtin_shufflevector(
4516 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4517}
4518
4519/// Constructs a 256-bit floating-point vector of [8 x float] from a
4520/// 128-bit floating-point vector of [4 x float].
4521///
4522/// The lower 128 bits contain the value of the source vector. The contents
4523/// of the upper 128 bits are undefined.
4524///
4525/// \headerfile <x86intrin.h>
4526///
4527/// This intrinsic has no corresponding instruction.
4528///
4529/// \param __a
4530/// A 128-bit vector of [4 x float].
4531/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4532/// contain the value of the parameter. The contents of the upper 128 bits
4533/// are undefined.
4534static __inline __m256 __DEFAULT_FN_ATTRS
4536{
4537 return __builtin_shufflevector((__v4sf)__a,
4538 (__v4sf)__builtin_nondeterministic_value(__a),
4539 0, 1, 2, 3, 4, 5, 6, 7);
4540}
4541
4542/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4543///
4544/// The lower 128 bits contain the value of the source vector. The contents
4545/// of the upper 128 bits are undefined.
4546///
4547/// \headerfile <x86intrin.h>
4548///
4549/// This intrinsic has no corresponding instruction.
4550///
4551/// \param __a
4552/// A 128-bit integer vector.
4553/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4554/// the parameter. The contents of the upper 128 bits are undefined.
4555static __inline __m256i __DEFAULT_FN_ATTRS
4557{
4558 return __builtin_shufflevector(
4559 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4560}
4561
4562/// Constructs a 256-bit floating-point vector of [4 x double] from a
4563/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4564/// contain the value of the source vector. The upper 128 bits are set
4565/// to zero.
4566///
4567/// \headerfile <x86intrin.h>
4568///
4569/// This intrinsic has no corresponding instruction.
4570///
4571/// \param __a
4572/// A 128-bit vector of [2 x double].
4573/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4574/// contain the value of the parameter. The upper 128 bits are set to zero.
4575static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4577 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4578}
4579
4580/// Constructs a 256-bit floating-point vector of [8 x float] from a
4581/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4582/// the value of the source vector. The upper 128 bits are set to zero.
4583///
4584/// \headerfile <x86intrin.h>
4585///
4586/// This intrinsic has no corresponding instruction.
4587///
4588/// \param __a
4589/// A 128-bit vector of [4 x float].
4590/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4591/// contain the value of the parameter. The upper 128 bits are set to zero.
4592static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4594 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4595}
4596
4597/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4598/// The lower 128 bits contain the value of the source vector. The upper
4599/// 128 bits are set to zero.
4600///
4601/// \headerfile <x86intrin.h>
4602///
4603/// This intrinsic has no corresponding instruction.
4604///
4605/// \param __a
4606/// A 128-bit integer vector.
4607/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4608/// the parameter. The upper 128 bits are set to zero.
4609static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4611 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4612}
4613
4614/*
4615 Vector insert.
4616 We use macros rather than inlines because we only want to accept
4617 invocations where the immediate M is a constant expression.
4618*/
4619/// Constructs a new 256-bit vector of [8 x float] by first duplicating
4620/// a 256-bit vector of [8 x float] given in the first parameter, and then
4621/// replacing either the upper or the lower 128 bits with the contents of a
4622/// 128-bit vector of [4 x float] in the second parameter.
4623///
4624/// The immediate integer parameter determines between the upper or the lower
4625/// 128 bits.
4626///
4627/// \headerfile <x86intrin.h>
4628///
4629/// \code
4630/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4631/// \endcode
4632///
4633/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4634///
4635/// \param V1
4636/// A 256-bit vector of [8 x float]. This vector is copied to the result
4637/// first, and then either the upper or the lower 128 bits of the result will
4638/// be replaced by the contents of \a V2.
4639/// \param V2
4640/// A 128-bit vector of [4 x float]. The contents of this parameter are
4641/// written to either the upper or the lower 128 bits of the result depending
4642/// on the value of parameter \a M.
4643/// \param M
4644/// An immediate integer. The least significant bit determines how the values
4645/// from the two parameters are interleaved: \n
4646/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4647/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4648/// result. \n
4649/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4650/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4651/// result.
4652/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M)                                        \
  ((__m256)__builtin_ia32_vinsertf128_ps256(                                   \
      (__v8sf)(__m256)(V1), (__v4sf)(__m128)(V2), (int)(M)))
4656
/// Builds a 256-bit vector of [4 x double] from a copy of the first
///    parameter in which one 128-bit half has been overwritten with the
///    128-bit vector of [2 x double] supplied as the second parameter.
///
///    The least significant bit of the immediate third parameter chooses
///    which half is overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the half of the result
///    that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. It is written to either the lower or
///    the upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Its least significant bit selects the half of the
///    result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M)                                        \
  ((__m256d)__builtin_ia32_vinsertf128_pd256(                                  \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))
4694
/// Builds a 256-bit integer vector from a copy of the first parameter in
///    which one 128-bit half has been overwritten with the 128-bit integer
///    vector supplied as the second parameter.
///
///    The least significant bit of the immediate third parameter chooses
///    which half is overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the half of the result that is
///    not overwritten by \a V2.
/// \param V2
///    A 128-bit integer vector. It is written to either the lower or the
///    upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Its least significant bit selects the half of the
///    result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M)                                     \
  ((__m256i)__builtin_ia32_vinsertf128_si256(                                  \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
4732
4733/*
4734 Vector extract.
4735 We use macros rather than inlines because we only want to accept
4736 invocations where the immediate M is a constant expression.
4737*/
/// Selects the lower or the upper 128 bits of a 256-bit vector of
///    [8 x float], according to the immediate integer parameter, and returns
///    them as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    The 256-bit vector of [8 x float] to extract from.
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the selected bits.
#define _mm256_extractf128_ps(V, M)                                            \
  ((__m128)__builtin_ia32_vextractf128_ps256(                                  \
      (__v8sf)(__m256)(V), (int)(M)))
4761
/// Selects the lower or the upper 128 bits of a 256-bit vector of
///    [4 x double], according to the immediate integer parameter, and returns
///    them as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    The 256-bit vector of [4 x double] to extract from.
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the selected bits.
#define _mm256_extractf128_pd(V, M)                                            \
  ((__m128d)__builtin_ia32_vextractf128_pd256(                                 \
      (__v4df)(__m256d)(V), (int)(M)))
4785
/// Selects the lower or the upper 128 bits of a 256-bit integer vector,
///    according to the immediate integer parameter, and returns them as a
///    128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    The 256-bit integer vector to extract from.
/// \param M
///    An immediate integer. Its least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the selected bits.
#define _mm256_extractf128_si256(V, M)                                         \
  ((__m128i)__builtin_ia32_vextractf128_si256(                                 \
      (__v8si)(__m256i)(V), (int)(M)))
4809
4810/// Constructs a 256-bit floating-point vector of [8 x float] by
4811/// concatenating two 128-bit floating-point vectors of [4 x float].
4812///
4813/// \headerfile <x86intrin.h>
4814///
4815/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4816///
4817/// \param __hi
4818/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4819/// 128 bits of the result.
4820/// \param __lo
4821/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4822/// 128 bits of the result.
4823/// \returns A 256-bit floating-point vector of [8 x float] containing the
4824/// concatenated result.
4825static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4826_mm256_set_m128(__m128 __hi, __m128 __lo) {
4827 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4828}
4829
4830/// Constructs a 256-bit floating-point vector of [4 x double] by
4831/// concatenating two 128-bit floating-point vectors of [2 x double].
4832///
4833/// \headerfile <x86intrin.h>
4834///
4835/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4836///
4837/// \param __hi
4838/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4839/// 128 bits of the result.
4840/// \param __lo
4841/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4842/// 128 bits of the result.
4843/// \returns A 256-bit floating-point vector of [4 x double] containing the
4844/// concatenated result.
4845static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4846_mm256_set_m128d(__m128d __hi, __m128d __lo) {
4847 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4848}
4849
4850/// Constructs a 256-bit integer vector by concatenating two 128-bit
4851/// integer vectors.
4852///
4853/// \headerfile <x86intrin.h>
4854///
4855/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4856///
4857/// \param __hi
4858/// A 128-bit integer vector to be copied to the upper 128 bits of the
4859/// result.
4860/// \param __lo
4861/// A 128-bit integer vector to be copied to the lower 128 bits of the
4862/// result.
4863/// \returns A 256-bit integer vector containing the concatenated result.
4864static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4865_mm256_set_m128i(__m128i __hi, __m128i __lo) {
4866 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4867}
4868
4869/// Constructs a 256-bit floating-point vector of [8 x float] by
4870/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4871/// similar to _mm256_set_m128, but the order of the input parameters is
4872/// swapped.
4873///
4874/// \headerfile <x86intrin.h>
4875///
4876/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4877///
4878/// \param __lo
4879/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4880/// 128 bits of the result.
4881/// \param __hi
4882/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4883/// 128 bits of the result.
4884/// \returns A 256-bit floating-point vector of [8 x float] containing the
4885/// concatenated result.
4886static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4887_mm256_setr_m128(__m128 __lo, __m128 __hi) {
4888 return _mm256_set_m128(__hi, __lo);
4889}
4890
4891/// Constructs a 256-bit floating-point vector of [4 x double] by
4892/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4893/// similar to _mm256_set_m128d, but the order of the input parameters is
4894/// swapped.
4895///
4896/// \headerfile <x86intrin.h>
4897///
4898/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4899///
4900/// \param __lo
4901/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4902/// 128 bits of the result.
4903/// \param __hi
4904/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4905/// 128 bits of the result.
4906/// \returns A 256-bit floating-point vector of [4 x double] containing the
4907/// concatenated result.
4908static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4909_mm256_setr_m128d(__m128d __lo, __m128d __hi) {
4910 return (__m256d)_mm256_set_m128d(__hi, __lo);
4911}
4912
4913/// Constructs a 256-bit integer vector by concatenating two 128-bit
4914/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4915/// the input parameters is swapped.
4916///
4917/// \headerfile <x86intrin.h>
4918///
4919/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4920///
4921/// \param __lo
4922/// A 128-bit integer vector to be copied to the lower 128 bits of the
4923/// result.
4924/// \param __hi
4925/// A 128-bit integer vector to be copied to the upper 128 bits of the
4926/// result.
4927/// \returns A 256-bit integer vector containing the concatenated result.
4928static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4929_mm256_setr_m128i(__m128i __lo, __m128i __hi) {
4930 return (__m256i)_mm256_set_m128i(__hi, __lo);
4931}
4932
4933/* SIMD load ops (unaligned) */
4934/// Loads two 128-bit floating-point vectors of [4 x float] from
4935/// unaligned memory locations and constructs a 256-bit floating-point vector
4936/// of [8 x float] by concatenating the two 128-bit vectors.
4937///
4938/// \headerfile <x86intrin.h>
4939///
4940/// This intrinsic corresponds to load instructions followed by the
4941/// <c> VINSERTF128 </c> instruction.
4942///
4943/// \param __addr_hi
4944/// A pointer to a 128-bit memory location containing 4 consecutive
4945/// single-precision floating-point values. These values are to be copied to
4946/// bits[255:128] of the result. The address of the memory location does not
4947/// have to be aligned.
4948/// \param __addr_lo
4949/// A pointer to a 128-bit memory location containing 4 consecutive
4950/// single-precision floating-point values. These values are to be copied to
4951/// bits[127:0] of the result. The address of the memory location does not
4952/// have to be aligned.
4953/// \returns A 256-bit floating-point vector of [8 x float] containing the
4954/// concatenated result.
4955static __inline __m256 __DEFAULT_FN_ATTRS
4956_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4957{
4958 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4959}
4960
4961/// Loads two 128-bit floating-point vectors of [2 x double] from
4962/// unaligned memory locations and constructs a 256-bit floating-point vector
4963/// of [4 x double] by concatenating the two 128-bit vectors.
4964///
4965/// \headerfile <x86intrin.h>
4966///
4967/// This intrinsic corresponds to load instructions followed by the
4968/// <c> VINSERTF128 </c> instruction.
4969///
4970/// \param __addr_hi
4971/// A pointer to a 128-bit memory location containing two consecutive
4972/// double-precision floating-point values. These values are to be copied to
4973/// bits[255:128] of the result. The address of the memory location does not
4974/// have to be aligned.
4975/// \param __addr_lo
4976/// A pointer to a 128-bit memory location containing two consecutive
4977/// double-precision floating-point values. These values are to be copied to
4978/// bits[127:0] of the result. The address of the memory location does not
4979/// have to be aligned.
4980/// \returns A 256-bit floating-point vector of [4 x double] containing the
4981/// concatenated result.
4982static __inline __m256d __DEFAULT_FN_ATTRS
4983_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4984{
4985 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4986}
4987
4988/// Loads two 128-bit integer vectors from unaligned memory locations and
4989/// constructs a 256-bit integer vector by concatenating the two 128-bit
4990/// vectors.
4991///
4992/// \headerfile <x86intrin.h>
4993///
4994/// This intrinsic corresponds to load instructions followed by the
4995/// <c> VINSERTF128 </c> instruction.
4996///
4997/// \param __addr_hi
4998/// A pointer to a 128-bit memory location containing a 128-bit integer
4999/// vector. This vector is to be copied to bits[255:128] of the result. The
5000/// address of the memory location does not have to be aligned.
5001/// \param __addr_lo
5002/// A pointer to a 128-bit memory location containing a 128-bit integer
5003/// vector. This vector is to be copied to bits[127:0] of the result. The
5004/// address of the memory location does not have to be aligned.
5005/// \returns A 256-bit integer vector containing the concatenated result.
5006static __inline __m256i __DEFAULT_FN_ATTRS
5007_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
5008{
5009 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5010}
5011
5012/* SIMD store ops (unaligned) */
5013/// Stores the upper and lower 128 bits of a 256-bit floating-point
5014/// vector of [8 x float] into two different unaligned memory locations.
5015///
5016/// \headerfile <x86intrin.h>
5017///
5018/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5019/// store instructions.
5020///
5021/// \param __addr_hi
5022/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5023/// copied to this memory location. The address of this memory location does
5024/// not have to be aligned.
5025/// \param __addr_lo
5026/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5027/// copied to this memory location. The address of this memory location does
5028/// not have to be aligned.
5029/// \param __a
5030/// A 256-bit floating-point vector of [8 x float].
5031static __inline void __DEFAULT_FN_ATTRS
5032_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5033{
5034 __m128 __v128;
5035
5036 __v128 = _mm256_castps256_ps128(__a);
5037 _mm_storeu_ps(__addr_lo, __v128);
5038 __v128 = _mm256_extractf128_ps(__a, 1);
5039 _mm_storeu_ps(__addr_hi, __v128);
5040}
5041
5042/// Stores the upper and lower 128 bits of a 256-bit floating-point
5043/// vector of [4 x double] into two different unaligned memory locations.
5044///
5045/// \headerfile <x86intrin.h>
5046///
5047/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5048/// store instructions.
5049///
5050/// \param __addr_hi
5051/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5052/// copied to this memory location. The address of this memory location does
5053/// not have to be aligned.
5054/// \param __addr_lo
5055/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5056/// copied to this memory location. The address of this memory location does
5057/// not have to be aligned.
5058/// \param __a
5059/// A 256-bit floating-point vector of [4 x double].
5060static __inline void __DEFAULT_FN_ATTRS
5061_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5062{
5063 __m128d __v128;
5064
5065 __v128 = _mm256_castpd256_pd128(__a);
5066 _mm_storeu_pd(__addr_lo, __v128);
5067 __v128 = _mm256_extractf128_pd(__a, 1);
5068 _mm_storeu_pd(__addr_hi, __v128);
5069}
5070
5071/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5072/// two different unaligned memory locations.
5073///
5074/// \headerfile <x86intrin.h>
5075///
5076/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5077/// store instructions.
5078///
5079/// \param __addr_hi
5080/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5081/// copied to this memory location. The address of this memory location does
5082/// not have to be aligned.
5083/// \param __addr_lo
5084/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5085/// copied to this memory location. The address of this memory location does
5086/// not have to be aligned.
5087/// \param __a
5088/// A 256-bit integer vector.
5089static __inline void __DEFAULT_FN_ATTRS
5090_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5091{
5092 __m128i __v128;
5093
5094 __v128 = _mm256_castsi256_si128(__a);
5095 _mm_storeu_si128(__addr_lo, __v128);
5096 __v128 = _mm256_extractf128_si256(__a, 1);
5097 _mm_storeu_si128(__addr_hi, __v128);
5098}
5099
/* Remove the function-attribute helper macros used by the declarations in
 * this header so that they are no longer defined past this point. */
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS
5104
5105#endif /* __AVXINTRIN_H */
__device__ _Float16
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3049
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition avxintrin.h:3093
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:744
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition avxintrin.h:3299
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2921
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition avxintrin.h:3595
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition avxintrin.h:3113
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4576
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition avxintrin.h:2279
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition avxintrin.h:3263
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition avxintrin.h:3319
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition avxintrin.h:4956
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:356
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3409
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition avxintrin.h:581
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition avxintrin.h:3755
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4593
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition avxintrin.h:116
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:390
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition avxintrin.h:2967
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition avxintrin.h:2204
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition avxintrin.h:3644
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition avxintrin.h:306
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2869
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition avxintrin.h:3990
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4887
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3434
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3385
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition avxintrin.h:982
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition avxintrin.h:4374
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition avxintrin.h:188
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4929
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition avxintrin.h:3357
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition avxintrin.h:4759
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition avxintrin.h:4807
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition avxintrin.h:3206
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2948
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition avxintrin.h:4425
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition avxintrin.h:4357
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition avxintrin.h:3684
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition avxintrin.h:2374
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4909
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition avxintrin.h:132
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition avxintrin.h:1406
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition avxintrin.h:3575
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition avxintrin.h:891
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition avxintrin.h:3631
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:373
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition avxintrin.h:3803
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition avxintrin.h:2189
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition avxintrin.h:602
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition avxintrin.h:3507
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4513
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition avxintrin.h:4207
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2517
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition avxintrin.h:2259
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition avxintrin.h:3657
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition avxintrin.h:2239
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition avxintrin.h:2223
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition avxintrin.h:4226
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2491
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2605
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition avxintrin.h:288
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2839
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [8 x float].
Definition avxintrin.h:3149
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition avxintrin.h:2175
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition avxintrin.h:4783
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers returned in a vector of [8 x i32].
Definition avxintrin.h:2299
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4535
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3071
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition avxintrin.h:246
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zero.
Definition avxintrin.h:4328
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2781
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition avxintrin.h:4245
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition avxintrin.h:3886
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition avxintrin.h:4983
static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition avxintrin.h:2348
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition avxintrin.h:82
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:674
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2693
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two diffe...
Definition avxintrin.h:5061
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition avxintrin.h:339
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x doub...
Definition avxintrin.h:151
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:721
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x floa...
Definition avxintrin.h:170
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2722
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition avxintrin.h:4188
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements set to the specified 64-bit integral value.
Definition avxintrin.h:4302
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition avxintrin.h:3531
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition avxintrin.h:3483
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition avxintrin.h:4155
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4442
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p into a vector of [4 x double].
Definition avxintrin.h:3166
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-preci...
Definition avxintrin.h:3723
static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-p...
Definition avxintrin.h:4459
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2895
static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition avxintrin.h:2315
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4610
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition avxintrin.h:98
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:698
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:767
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2751
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition avxintrin.h:4070
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition avxintrin.h:2399
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition avxintrin.h:2421
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory l...
Definition avxintrin.h:5090
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2546
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
Definition avxintrin.h:4476
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:620
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition avxintrin.h:4408
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:656
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:638
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition avxintrin.h:1433
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition avxintrin.h:5007
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition avxintrin.h:797
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition avxintrin.h:4316
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition avxintrin.h:3921
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3458
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition avxintrin.h:322
static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition avxintrin.h:2331
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition avxintrin.h:3616
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition avxintrin.h:560
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3222
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to b...
Definition avxintrin.h:3340
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition avxintrin.h:267
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4826
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x dou...
Definition avxintrin.h:4391
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4340
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4556
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition avxintrin.h:204
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p into a vector of [8 x float].
Definition avxintrin.h:3186
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3027
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2663
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition avxintrin.h:4022
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition avxintrin.h:2985
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition avxintrin.h:2465
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition avxintrin.h:2444
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition avxintrin.h:4263
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition avxintrin.h:836
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set to the specified 8-bit integral value.
Definition avxintrin.h:4281
static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition avxintrin.h:4492
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition avxintrin.h:225
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2634
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3243
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4865
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte align...
Definition avxintrin.h:3281
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the spe...
Definition avxintrin.h:3950
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two differ...
Definition avxintrin.h:5032
typedef double __v4df __attribute__((__vector_size__(32)))
Definition avxintrin.h:17
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2575
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2810
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition avxintrin.h:3555
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4846
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [4 x double].
Definition avxintrin.h:3133
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition avxintrin.h:542
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition emmintrin.h:1619
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878
static __inline__ void _mm_stream_si32(void *__p, int __a)
Definition emmintrin.h:4077
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1867
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition emmintrin.h:3456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition emmintrin.h:1980
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition emmintrin.h:3909
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2100
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2021
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1863