/* Listing of avxintrin.h as shipped with clang 22.0.0git. */
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

/* Internal 256-bit element-typed vectors used for casts inside this header. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types, aligned to 32 bytes. */
typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Variants of the public types with alignment 1, for unaligned accesses. */
typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));

42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(128)))

/* The _CONSTEXPR variants add `constexpr` only when compiling as C++11 or
 * later; otherwise they are identical to the plain attribute macros. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

68/* Arithmetic */
69/// Adds two 256-bit vectors of [4 x double].
70///
71/// \headerfile <x86intrin.h>
72///
73/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
74///
75/// \param __a
76/// A 256-bit vector of [4 x double] containing one of the source operands.
77/// \param __b
78/// A 256-bit vector of [4 x double] containing one of the source operands.
79/// \returns A 256-bit vector of [4 x double] containing the sums of both
80/// operands.
81static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
82_mm256_add_pd(__m256d __a, __m256d __b) {
83 return (__m256d)((__v4df)__a+(__v4df)__b);
84}
85
86/// Adds two 256-bit vectors of [8 x float].
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
91///
92/// \param __a
93/// A 256-bit vector of [8 x float] containing one of the source operands.
94/// \param __b
95/// A 256-bit vector of [8 x float] containing one of the source operands.
96/// \returns A 256-bit vector of [8 x float] containing the sums of both
97/// operands.
98static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a,
99 __m256 __b) {
100 return (__m256)((__v8sf)__a+(__v8sf)__b);
101}
102
103/// Subtracts two 256-bit vectors of [4 x double].
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
108///
109/// \param __a
110/// A 256-bit vector of [4 x double] containing the minuend.
111/// \param __b
112/// A 256-bit vector of [4 x double] containing the subtrahend.
113/// \returns A 256-bit vector of [4 x double] containing the differences between
114/// both operands.
115static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
116_mm256_sub_pd(__m256d __a, __m256d __b) {
117 return (__m256d)((__v4df)__a-(__v4df)__b);
118}
119
120/// Subtracts two 256-bit vectors of [8 x float].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
125///
126/// \param __a
127/// A 256-bit vector of [8 x float] containing the minuend.
128/// \param __b
129/// A 256-bit vector of [8 x float] containing the subtrahend.
130/// \returns A 256-bit vector of [8 x float] containing the differences between
131/// both operands.
132static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a,
133 __m256 __b) {
134 return (__m256)((__v8sf)__a-(__v8sf)__b);
135}
136
137/// Adds the even-indexed values and subtracts the odd-indexed values of
138/// two 256-bit vectors of [4 x double].
139///
140/// \headerfile <x86intrin.h>
141///
142/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
143///
144/// \param __a
145/// A 256-bit vector of [4 x double] containing the left source operand.
146/// \param __b
147/// A 256-bit vector of [4 x double] containing the right source operand.
148/// \returns A 256-bit vector of [4 x double] containing the alternating sums
149/// and differences between both operands.
150static __inline __m256d __DEFAULT_FN_ATTRS
151_mm256_addsub_pd(__m256d __a, __m256d __b)
152{
153 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
154}
155
156/// Adds the even-indexed values and subtracts the odd-indexed values of
157/// two 256-bit vectors of [8 x float].
158///
159/// \headerfile <x86intrin.h>
160///
161/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
162///
163/// \param __a
164/// A 256-bit vector of [8 x float] containing the left source operand.
165/// \param __b
166/// A 256-bit vector of [8 x float] containing the right source operand.
167/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
168/// differences between both operands.
169static __inline __m256 __DEFAULT_FN_ATTRS
170_mm256_addsub_ps(__m256 __a, __m256 __b)
171{
172 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
173}
174
175/// Divides two 256-bit vectors of [4 x double].
176///
177/// \headerfile <x86intrin.h>
178///
179/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
180///
181/// \param __a
182/// A 256-bit vector of [4 x double] containing the dividend.
183/// \param __b
184/// A 256-bit vector of [4 x double] containing the divisor.
185/// \returns A 256-bit vector of [4 x double] containing the quotients of both
186/// operands.
187static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
188_mm256_div_pd(__m256d __a, __m256d __b) {
189 return (__m256d)((__v4df)__a/(__v4df)__b);
190}
191
192/// Divides two 256-bit vectors of [8 x float].
193///
194/// \headerfile <x86intrin.h>
195///
196/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
197///
198/// \param __a
199/// A 256-bit vector of [8 x float] containing the dividend.
200/// \param __b
201/// A 256-bit vector of [8 x float] containing the divisor.
202/// \returns A 256-bit vector of [8 x float] containing the quotients of both
203/// operands.
204static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a,
205 __m256 __b) {
206 return (__m256)((__v8sf)__a/(__v8sf)__b);
207}
208
209/// Compares two 256-bit vectors of [4 x double] and returns the greater
210/// of each pair of values.
211///
212/// If either value in a comparison is NaN, returns the value from \a __b.
213///
214/// \headerfile <x86intrin.h>
215///
216/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
217///
218/// \param __a
219/// A 256-bit vector of [4 x double] containing one of the operands.
220/// \param __b
221/// A 256-bit vector of [4 x double] containing one of the operands.
222/// \returns A 256-bit vector of [4 x double] containing the maximum values
223/// between both operands.
224static __inline __m256d __DEFAULT_FN_ATTRS
225_mm256_max_pd(__m256d __a, __m256d __b)
226{
227 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
228}
229
230/// Compares two 256-bit vectors of [8 x float] and returns the greater
231/// of each pair of values.
232///
233/// If either value in a comparison is NaN, returns the value from \a __b.
234///
235/// \headerfile <x86intrin.h>
236///
237/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
238///
239/// \param __a
240/// A 256-bit vector of [8 x float] containing one of the operands.
241/// \param __b
242/// A 256-bit vector of [8 x float] containing one of the operands.
243/// \returns A 256-bit vector of [8 x float] containing the maximum values
244/// between both operands.
245static __inline __m256 __DEFAULT_FN_ATTRS
246_mm256_max_ps(__m256 __a, __m256 __b)
247{
248 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
249}
250
251/// Compares two 256-bit vectors of [4 x double] and returns the lesser
252/// of each pair of values.
253///
254/// If either value in a comparison is NaN, returns the value from \a __b.
255///
256/// \headerfile <x86intrin.h>
257///
258/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
259///
260/// \param __a
261/// A 256-bit vector of [4 x double] containing one of the operands.
262/// \param __b
263/// A 256-bit vector of [4 x double] containing one of the operands.
264/// \returns A 256-bit vector of [4 x double] containing the minimum values
265/// between both operands.
266static __inline __m256d __DEFAULT_FN_ATTRS
267_mm256_min_pd(__m256d __a, __m256d __b)
268{
269 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
270}
271
272/// Compares two 256-bit vectors of [8 x float] and returns the lesser
273/// of each pair of values.
274///
275/// If either value in a comparison is NaN, returns the value from \a __b.
276///
277/// \headerfile <x86intrin.h>
278///
279/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
280///
281/// \param __a
282/// A 256-bit vector of [8 x float] containing one of the operands.
283/// \param __b
284/// A 256-bit vector of [8 x float] containing one of the operands.
285/// \returns A 256-bit vector of [8 x float] containing the minimum values
286/// between both operands.
287static __inline __m256 __DEFAULT_FN_ATTRS
288_mm256_min_ps(__m256 __a, __m256 __b)
289{
290 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
291}
292
293/// Multiplies two 256-bit vectors of [4 x double].
294///
295/// \headerfile <x86intrin.h>
296///
297/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
298///
299/// \param __a
300/// A 256-bit vector of [4 x double] containing one of the operands.
301/// \param __b
302/// A 256-bit vector of [4 x double] containing one of the operands.
303/// \returns A 256-bit vector of [4 x double] containing the products of both
304/// operands.
305static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
306_mm256_mul_pd(__m256d __a, __m256d __b) {
307 return (__m256d)((__v4df)__a * (__v4df)__b);
308}
309
310/// Multiplies two 256-bit vectors of [8 x float].
311///
312/// \headerfile <x86intrin.h>
313///
314/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
315///
316/// \param __a
317/// A 256-bit vector of [8 x float] containing one of the operands.
318/// \param __b
319/// A 256-bit vector of [8 x float] containing one of the operands.
320/// \returns A 256-bit vector of [8 x float] containing the products of both
321/// operands.
322static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
323 __m256 __b) {
324 return (__m256)((__v8sf)__a * (__v8sf)__b);
325}
326
327/// Calculates the square roots of the values in a 256-bit vector of
328/// [4 x double].
329///
330/// \headerfile <x86intrin.h>
331///
332/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
333///
334/// \param __a
335/// A 256-bit vector of [4 x double].
336/// \returns A 256-bit vector of [4 x double] containing the square roots of the
337/// values in the operand.
338static __inline __m256d __DEFAULT_FN_ATTRS
340{
341 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
342}
343
344/// Calculates the square roots of the values in a 256-bit vector of
345/// [8 x float].
346///
347/// \headerfile <x86intrin.h>
348///
349/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
350///
351/// \param __a
352/// A 256-bit vector of [8 x float].
353/// \returns A 256-bit vector of [8 x float] containing the square roots of the
354/// values in the operand.
355static __inline __m256 __DEFAULT_FN_ATTRS
357{
358 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
359}
360
361/// Calculates the reciprocal square roots of the values in a 256-bit
362/// vector of [8 x float].
363///
364/// \headerfile <x86intrin.h>
365///
366/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
367///
368/// \param __a
369/// A 256-bit vector of [8 x float].
370/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
371/// roots of the values in the operand.
372static __inline __m256 __DEFAULT_FN_ATTRS
374{
375 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
376}
377
378/// Calculates the reciprocals of the values in a 256-bit vector of
379/// [8 x float].
380///
381/// \headerfile <x86intrin.h>
382///
383/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
384///
385/// \param __a
386/// A 256-bit vector of [8 x float].
387/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
388/// values in the operand.
389static __inline __m256 __DEFAULT_FN_ATTRS
391{
392 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
393}
394
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)

/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

528/* Logical */
529/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
530///
531/// \headerfile <x86intrin.h>
532///
533/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
534///
535/// \param __a
536/// A 256-bit vector of [4 x double] containing one of the source operands.
537/// \param __b
538/// A 256-bit vector of [4 x double] containing one of the source operands.
539/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
540/// values between both operands.
541static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
542_mm256_and_pd(__m256d __a, __m256d __b)
543{
544 return (__m256d)((__v4du)__a & (__v4du)__b);
545}
546
547/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
548///
549/// \headerfile <x86intrin.h>
550///
551/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
552///
553/// \param __a
554/// A 256-bit vector of [8 x float] containing one of the source operands.
555/// \param __b
556/// A 256-bit vector of [8 x float] containing one of the source operands.
557/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
558/// values between both operands.
559static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
560_mm256_and_ps(__m256 __a, __m256 __b)
561{
562 return (__m256)((__v8su)__a & (__v8su)__b);
563}
564
565/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
566/// the one's complement of the values contained in the first source operand.
567///
568/// \headerfile <x86intrin.h>
569///
570/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
571///
572/// \param __a
573/// A 256-bit vector of [4 x double] containing the left source operand. The
574/// one's complement of this value is used in the bitwise AND.
575/// \param __b
576/// A 256-bit vector of [4 x double] containing the right source operand.
577/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
578/// values of the second operand and the one's complement of the first
579/// operand.
580static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
581_mm256_andnot_pd(__m256d __a, __m256d __b)
582{
583 return (__m256d)(~(__v4du)__a & (__v4du)__b);
584}
585
586/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
587/// the one's complement of the values contained in the first source operand.
588///
589/// \headerfile <x86intrin.h>
590///
591/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
592///
593/// \param __a
594/// A 256-bit vector of [8 x float] containing the left source operand. The
595/// one's complement of this value is used in the bitwise AND.
596/// \param __b
597/// A 256-bit vector of [8 x float] containing the right source operand.
598/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
599/// values of the second operand and the one's complement of the first
600/// operand.
601static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
602_mm256_andnot_ps(__m256 __a, __m256 __b)
603{
604 return (__m256)(~(__v8su)__a & (__v8su)__b);
605}
606
607/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
608///
609/// \headerfile <x86intrin.h>
610///
611/// This intrinsic corresponds to the <c> VORPD </c> instruction.
612///
613/// \param __a
614/// A 256-bit vector of [4 x double] containing one of the source operands.
615/// \param __b
616/// A 256-bit vector of [4 x double] containing one of the source operands.
617/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
618/// values between both operands.
619static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
620_mm256_or_pd(__m256d __a, __m256d __b)
621{
622 return (__m256d)((__v4du)__a | (__v4du)__b);
623}
624
625/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
626///
627/// \headerfile <x86intrin.h>
628///
629/// This intrinsic corresponds to the <c> VORPS </c> instruction.
630///
631/// \param __a
632/// A 256-bit vector of [8 x float] containing one of the source operands.
633/// \param __b
634/// A 256-bit vector of [8 x float] containing one of the source operands.
635/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
636/// values between both operands.
637static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
638_mm256_or_ps(__m256 __a, __m256 __b)
639{
640 return (__m256)((__v8su)__a | (__v8su)__b);
641}
642
643/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
644///
645/// \headerfile <x86intrin.h>
646///
647/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
648///
649/// \param __a
650/// A 256-bit vector of [4 x double] containing one of the source operands.
651/// \param __b
652/// A 256-bit vector of [4 x double] containing one of the source operands.
653/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
654/// values between both operands.
655static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
656_mm256_xor_pd(__m256d __a, __m256d __b)
657{
658 return (__m256d)((__v4du)__a ^ (__v4du)__b);
659}
660
661/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
662///
663/// \headerfile <x86intrin.h>
664///
665/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
666///
667/// \param __a
668/// A 256-bit vector of [8 x float] containing one of the source operands.
669/// \param __b
670/// A 256-bit vector of [8 x float] containing one of the source operands.
671/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
672/// values between both operands.
673static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
674_mm256_xor_ps(__m256 __a, __m256 __b)
675{
676 return (__m256)((__v8su)__a ^ (__v8su)__b);
677}
678
679/* Horizontal arithmetic */
680/// Horizontally adds the adjacent pairs of values contained in two
681/// 256-bit vectors of [4 x double].
682///
683/// \headerfile <x86intrin.h>
684///
685/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
686///
687/// \param __a
688/// A 256-bit vector of [4 x double] containing one of the source operands.
689/// The horizontal sums of the values are returned in the even-indexed
690/// elements of a vector of [4 x double].
691/// \param __b
692/// A 256-bit vector of [4 x double] containing one of the source operands.
693/// The horizontal sums of the values are returned in the odd-indexed
694/// elements of a vector of [4 x double].
695/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
696/// both operands.
697static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
698_mm256_hadd_pd(__m256d __a, __m256d __b) {
699 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
700}
701
702/// Horizontally adds the adjacent pairs of values contained in two
703/// 256-bit vectors of [8 x float].
704///
705/// \headerfile <x86intrin.h>
706///
707/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
708///
709/// \param __a
710/// A 256-bit vector of [8 x float] containing one of the source operands.
711/// The horizontal sums of the values are returned in the elements with
712/// index 0, 1, 4, 5 of a vector of [8 x float].
713/// \param __b
714/// A 256-bit vector of [8 x float] containing one of the source operands.
715/// The horizontal sums of the values are returned in the elements with
716/// index 2, 3, 6, 7 of a vector of [8 x float].
717/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
718/// both operands.
719static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
720 __m256 __b) {
721 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
722}
723
724/// Horizontally subtracts the adjacent pairs of values contained in two
725/// 256-bit vectors of [4 x double].
726///
727/// \headerfile <x86intrin.h>
728///
729/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
730///
731/// \param __a
732/// A 256-bit vector of [4 x double] containing one of the source operands.
733/// The horizontal differences between the values are returned in the
734/// even-indexed elements of a vector of [4 x double].
735/// \param __b
736/// A 256-bit vector of [4 x double] containing one of the source operands.
737/// The horizontal differences between the values are returned in the
738/// odd-indexed elements of a vector of [4 x double].
739/// \returns A 256-bit vector of [4 x double] containing the horizontal
740/// differences of both operands.
741static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
742_mm256_hsub_pd(__m256d __a, __m256d __b) {
743 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
744}
745
746/// Horizontally subtracts the adjacent pairs of values contained in two
747/// 256-bit vectors of [8 x float].
748///
749/// \headerfile <x86intrin.h>
750///
751/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
752///
753/// \param __a
754/// A 256-bit vector of [8 x float] containing one of the source operands.
755/// The horizontal differences between the values are returned in the
756/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
757/// \param __b
758/// A 256-bit vector of [8 x float] containing one of the source operands.
759/// The horizontal differences between the values are returned in the
760/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
761/// \returns A 256-bit vector of [8 x float] containing the horizontal
762/// differences of both operands.
763static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a,
764 __m256 __b) {
765 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
766}
767
768/* Vector permutations */
769/// Copies the values in a 128-bit vector of [2 x double] as specified
770/// by the 128-bit integer vector operand.
771///
772/// \headerfile <x86intrin.h>
773///
774/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
775///
776/// \param __a
777/// A 128-bit vector of [2 x double].
778/// \param __c
779/// A 128-bit integer vector operand specifying how the values are to be
780/// copied. \n
781/// Bit [1]: \n
782/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
783/// vector. \n
784/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
785/// returned vector. \n
786/// Bit [65]: \n
787/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
788/// returned vector. \n
789/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
790/// returned vector.
791/// \returns A 128-bit vector of [2 x double] containing the copied values.
792static __inline __m128d __DEFAULT_FN_ATTRS128
793_mm_permutevar_pd(__m128d __a, __m128i __c)
794{
795 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
796}
797
798/// Copies the values in a 256-bit vector of [4 x double] as specified
799/// by the 256-bit integer vector operand.
800///
801/// \headerfile <x86intrin.h>
802///
803/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
804///
805/// \param __a
806/// A 256-bit vector of [4 x double].
807/// \param __c
808/// A 256-bit integer vector operand specifying how the values are to be
809/// copied. \n
810/// Bit [1]: \n
811/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
812/// vector. \n
813/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
814/// returned vector. \n
815/// Bit [65]: \n
816/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
817/// returned vector. \n
818/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
819/// returned vector. \n
820/// Bit [129]: \n
821/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
822/// returned vector. \n
823/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
824/// returned vector. \n
825/// Bit [193]: \n
826/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
827/// returned vector. \n
828/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
829/// returned vector.
830/// \returns A 256-bit vector of [4 x double] containing the copied values.
831static __inline __m256d __DEFAULT_FN_ATTRS
832_mm256_permutevar_pd(__m256d __a, __m256i __c)
833{
834 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
835}
836
837/// Copies the values stored in a 128-bit vector of [4 x float] as
838/// specified by the 128-bit integer vector operand.
839///
840/// \headerfile <x86intrin.h>
841///
842/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
843///
844/// \param __a
845/// A 128-bit vector of [4 x float].
846/// \param __c
847/// A 128-bit integer vector operand specifying how the values are to be
848/// copied. \n
849/// Bits [1:0]: \n
850/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
851/// returned vector. \n
852/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
853/// returned vector. \n
854/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
855/// returned vector. \n
856/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
857/// returned vector. \n
858/// Bits [33:32]: \n
859/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
860/// returned vector. \n
861/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
862/// returned vector. \n
863/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
864/// returned vector. \n
865/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
866/// returned vector. \n
867/// Bits [65:64]: \n
868/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
869/// returned vector. \n
870/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
871/// returned vector. \n
872/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
873/// returned vector. \n
874/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
875/// returned vector. \n
876/// Bits [97:96]: \n
877/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
878/// returned vector. \n
879/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
880/// returned vector. \n
881/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
882/// returned vector. \n
883/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
884/// returned vector.
885/// \returns A 128-bit vector of [4 x float] containing the copied values.
886static __inline __m128 __DEFAULT_FN_ATTRS128
887_mm_permutevar_ps(__m128 __a, __m128i __c)
888{
889 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
890}
891
892/// Copies the values stored in a 256-bit vector of [8 x float] as
893/// specified by the 256-bit integer vector operand.
894///
895/// \headerfile <x86intrin.h>
896///
897/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
898///
899/// \param __a
900/// A 256-bit vector of [8 x float].
901/// \param __c
902/// A 256-bit integer vector operand specifying how the values are to be
903/// copied. \n
904/// Bits [1:0]: \n
905/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
906/// returned vector. \n
907/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
908/// returned vector. \n
909/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
910/// returned vector. \n
911/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
912/// returned vector. \n
913/// Bits [33:32]: \n
914/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
915/// returned vector. \n
916/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
917/// returned vector. \n
918/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
919/// returned vector. \n
920/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
921/// returned vector. \n
922/// Bits [65:64]: \n
923/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
924/// returned vector. \n
925/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
926/// returned vector. \n
927/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
928/// returned vector. \n
929/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
930/// returned vector. \n
931/// Bits [97:96]: \n
932/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
933/// returned vector. \n
934/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
935/// returned vector. \n
936/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
937/// returned vector. \n
938/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
939/// returned vector. \n
940/// Bits [129:128]: \n
941/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
942/// returned vector. \n
943/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
944/// returned vector. \n
945/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
946/// returned vector. \n
947/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
948/// returned vector. \n
949/// Bits [161:160]: \n
950/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
951/// returned vector. \n
952/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
953/// returned vector. \n
954/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
955/// returned vector. \n
956/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
957/// returned vector. \n
958/// Bits [193:192]: \n
959/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
960/// returned vector. \n
961/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
962/// returned vector. \n
963/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
964/// returned vector. \n
965/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
966/// returned vector. \n
967/// Bits [225:224]: \n
968/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
969/// returned vector. \n
970/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
971/// returned vector. \n
972/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
973/// returned vector. \n
974/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
975/// returned vector.
976/// \returns A 256-bit vector of [8 x float] containing the copied values.
977static __inline __m256 __DEFAULT_FN_ATTRS
979{
980 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
981}
982
/// Rearranges the values of a 128-bit vector of [2 x double] according to
///    an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double] providing the source values.
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0] selects the value of returned bits [63:0]: \n
///      0: Bits [63:0] of the source are copied. \n
///      1: Bits [127:64] of the source are copied. \n
///    Bit [1] selects the value of returned bits [127:64]: \n
///      0: Bits [63:0] of the source are copied. \n
///      1: Bits [127:64] of the source are copied.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd( \
      (__v2df)(__m128d)(A), \
      (int)(C)))
1012
/// Rearranges the values of a 256-bit vector of [4 x double] according to
///    an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double] providing the source values.
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0] selects the value of returned bits [63:0]: \n
///      0: Bits [63:0] of the source are copied. \n
///      1: Bits [127:64] of the source are copied. \n
///    Bit [1] selects the value of returned bits [127:64]: \n
///      0: Bits [63:0] of the source are copied. \n
///      1: Bits [127:64] of the source are copied. \n
///    Bit [2] selects the value of returned bits [191:128]: \n
///      0: Bits [191:128] of the source are copied. \n
///      1: Bits [255:192] of the source are copied. \n
///    Bit [3] selects the value of returned bits [255:192]: \n
///      0: Bits [191:128] of the source are copied. \n
///      1: Bits [255:192] of the source are copied.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256( \
      (__v4df)(__m256d)(A), \
      (int)(C)))
1052
/// Rearranges the values of a 128-bit vector of [4 x float] according to
///    an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float] providing the source values.
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. It holds four two-bit fields, one per returned element: \n
///    Bits [1:0] select the value of returned bits [31:0]. \n
///    Bits [3:2] select the value of returned bits [63:32]. \n
///    Bits [5:4] select the value of returned bits [95:64]. \n
///    Bits [7:6] select the value of returned bits [127:96]. \n
///    Every field uses the same encoding: \n
///      00: Bits [31:0] of the source are copied. \n
///      01: Bits [63:32] of the source are copied. \n
///      10: Bits [95:64] of the source are copied. \n
///      11: Bits [127:96] of the source are copied.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps( \
      (__v4sf)(__m128)(A), \
      (int)(C)))
1108
/// Rearranges the values of a 256-bit vector of [8 x float] according to
///    an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float] providing the source values.
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. It holds four two-bit fields, and each field is applied to
///    both 128-bit halves of the vector: \n
///    Bits [1:0] select the values of returned bits [31:0] and
///    [159:128]. \n
///    Bits [3:2] select the values of returned bits [63:32] and
///    [191:160]. \n
///    Bits [5:4] select the values of returned bits [95:64] and
///    [223:192]. \n
///    Bits [7:6] select the values of returned bits [127:96] and
///    [255:224]. \n
///    Every field uses the same encoding. In each entry the first bit range
///    is the source for the lower half of the returned vector and the second
///    bit range is the source for the upper half: \n
///      00: Bits [31:0] and [159:128] of the source are copied. \n
///      01: Bits [63:32] and [191:160] of the source are copied. \n
///      10: Bits [95:64] and [223:192] of the source are copied. \n
///      11: Bits [127:96] and [255:224] of the source are copied.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256( \
      (__v8sf)(__m256)(A), \
      (int)(C)))
1200
/// Builds a 256-bit vector of [4 x double] out of 128-bit data values taken
///    from two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0] select the 128 bits copied to bits [127:0] of the
///    destination. \n
///    Bits [5:4] select the 128 bits copied to bits [255:128] of the
///    destination. \n
///    Both fields use the same encoding: \n
///      00: Bits [127:0] of operand \a V1 are copied. \n
///      01: Bits [255:128] of operand \a V1 are copied. \n
///      10: Bits [127:0] of operand \a V2 are copied. \n
///      11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256( \
      (__v4df)(__m256d)(V1), (__v4df)(__m256d)(V2), (int)(M)))
1241
/// Builds a 256-bit vector of [8 x float] out of 128-bit data values taken
///    from two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0] select the 128 bits copied to bits [127:0] of the
///    destination. \n
///    Bits [5:4] select the 128 bits copied to bits [255:128] of the
///    destination. \n
///    Both fields use the same encoding: \n
///      00: Bits [127:0] of operand \a V1 are copied. \n
///      01: Bits [255:128] of operand \a V1 are copied. \n
///      10: Bits [127:0] of operand \a V2 are copied. \n
///      11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256( \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (int)(M)))
1282
/// Builds a 256-bit integer vector out of 128-bit data values taken from
///    two 256-bit integer vectors, as specified by the immediate integer
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0] select the 128 bits copied to bits [127:0] of the
///    destination. \n
///    Bits [5:4] select the 128 bits copied to bits [255:128] of the
///    destination. \n
///    Both fields use the same encoding: \n
///      00: Bits [127:0] of operand \a V1 are copied. \n
///      01: Bits [255:128] of operand \a V1 are copied. \n
///      10: Bits [127:0] of operand \a V2 are copied. \n
///      11: Bits [255:128] of operand \a V2 are copied.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256( \
      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (int)(M)))
1322
1323/* Vector Blend */
/// Blends the 64-bit double-precision data values of two 256-bit vectors of
///    [4 x double], taking each element from one vector or the other as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand whose mask bits [3:0] specify how the
///    values are to be copied. The position of a mask bit is the index of the
///    64-bit element it controls. A mask bit of 0 copies that element from
///    operand \a V1 to the same position in the destination; a mask bit of 1
///    copies that element from operand \a V2 to the same position in the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256( \
      (__v4df)(__m256d)(V1), (__v4df)(__m256d)(V2), (int)(M)))
1351
/// Blends the 32-bit single-precision data values of two 256-bit vectors of
///    [8 x float], taking each element from one vector or the other as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand whose mask bits [7:0] specify how the
///    values are to be copied. The position of a mask bit is the index of the
///    32-bit element it controls. A mask bit of 0 copies that element from
///    operand \a V1 to the same position in the destination; a mask bit of 1
///    copies that element from operand \a V2 to the same position in the
///    destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256( \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (int)(M)))
1379
1380/// Merges 64-bit double-precision data values stored in either of the
1381/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1382/// operand.
1383///
1384/// \headerfile <x86intrin.h>
1385///
1386/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1387///
1388/// \param __a
1389/// A 256-bit vector of [4 x double].
1390/// \param __b
1391/// A 256-bit vector of [4 x double].
1392/// \param __c
1393/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1394/// how the values are to be copied. The position of the mask bit corresponds
1395/// to the most significant bit of a copied value. When a mask bit is 0, the
1396/// corresponding 64-bit element in operand \a __a is copied to the same
1397/// position in the destination. When a mask bit is 1, the corresponding
1398/// 64-bit element in operand \a __b is copied to the same position in the
1399/// destination.
1400/// \returns A 256-bit vector of [4 x double] containing the copied values.
1401static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
1402_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {
1403 return (__m256d)__builtin_ia32_blendvpd256(
1404 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1405}
1406
1407/// Merges 32-bit single-precision data values stored in either of the
1408/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1409/// operand.
1410///
1411/// \headerfile <x86intrin.h>
1412///
1413/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1414///
1415/// \param __a
1416/// A 256-bit vector of [8 x float].
1417/// \param __b
1418/// A 256-bit vector of [8 x float].
1419/// \param __c
1420/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1421/// and 31 specifying how the values are to be copied. The position of the
1422/// mask bit corresponds to the most significant bit of a copied value. When
1423/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1424/// copied to the same position in the destination. When a mask bit is 1, the
1425/// corresponding 32-bit element in operand \a __b is copied to the same
1426/// position in the destination.
1427/// \returns A 256-bit vector of [8 x float] containing the copied values.
1428static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
1429_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {
1430 return (__m256)__builtin_ia32_blendvps256(
1431 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1432}
1433
1434/* Vector Dot Product */
/// Computes two dot products in parallel. The lower halves of two
///    [8 x float] vectors are the input to one computation and the upper
///    halves are the input to the other; the two dot products are returned in
///    the lower and upper halves of the [8 x float] result.
///
///    The immediate integer operand controls which input elements contribute
///    to the dot product and where the final results are returned. In
///    general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two products and the second two
///    products are summed, and then the two sums are added to form the final
///    result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. \n
///    Bits [7:4] determine which elements of the input vectors are used, with
///    bit [4] corresponding to the lowest element and bit [7] corresponding
///    to the highest element of each [4 x float] subvector. If a bit is set,
///    the corresponding elements from the two input vectors are used as an
///    input for the dot product; otherwise that input is treated as zero. \n
///    Bits [3:0] determine which elements of the result receive a copy of the
///    final dot product, with bit [0] corresponding to the lowest element and
///    bit [3] corresponding to the highest element of each [4 x float]
///    subvector. If a bit is set, the dot product is returned in the
///    corresponding element; otherwise that element is set to zero. \n
///    The bitmask is applied in the same way to each of the two parallel dot
///    product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256( \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (M)))
1475
1476/* Vector shuffle */
/// Builds a 256-bit vector of [8 x float] from 8 float values selected out
///    of two 256-bit operands of [8 x float], as specified by the immediate
///    value operand.
///
///    Four elements are selected from each operand and copied to the
///    destination according to the bits of the immediate operand. The
///    elements selected from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] of the destination, and the elements selected
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] of the destination. For example, if bits [7:0] of the
///    immediate operand contain a value of 0xFF, the 256-bit destination
///    vector would contain the following values: b[7], b[7], a[7], a[7],
///    b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four elements selected from this
///    operand are copied to bits [63:0] and bits [191:128] in the
///    destination, according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four elements selected from this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements
///    to copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    Each two-bit field assigns one 32-bit value in each half of the 256-bit
///    destination, using the bit value assignments described below: \n
///    Bits [1:0] assign values to bits [31:0] and [159:128] in the
///    destination. \n
///    Bits [3:2] assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] assign values to bits [127:96] and [255:224] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected
///    operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256( \
      (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), (int)(mask)))
1532
/// Builds a 256-bit vector of [4 x double] from four double-precision values
///    selected out of two 256-bit operands of [4 x double], as specified by
///    the immediate value operand.
///
///    The elements selected from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the elements selected
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the
///    immediate operand contain a value of 0xF, the 256-bit destination
///    vector would contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value containing 8-bit values specifying which elements to
///    copy from \a a and \a b: \n
///    Bit [0] selects the value of destination bits [63:0], taken from
///    \a a: \n
///      0: Bits [63:0] of \a a are copied. \n
///      1: Bits [127:64] of \a a are copied. \n
///    Bit [1] selects the value of destination bits [127:64], taken from
///    \a b: \n
///      0: Bits [63:0] of \a b are copied. \n
///      1: Bits [127:64] of \a b are copied. \n
///    Bit [2] selects the value of destination bits [191:128], taken from
///    \a a: \n
///      0: Bits [191:128] of \a a are copied. \n
///      1: Bits [255:192] of \a a are copied. \n
///    Bit [3] selects the value of destination bits [255:192], taken from
///    \a b: \n
///      0: Bits [191:128] of \a b are copied. \n
///      1: Bits [255:192] of \a b are copied.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256( \
      (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), (int)(mask)))
1578
/* Compare */
/* Named comparison predicates 0x08-0x1F, usable as the immediate operand of
 * the cmp intrinsics documented in this header. Predicates 0x00-0x07 are not
 * defined in this block (presumably they come from the SSE headers -- confirm).
 */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1604
/* The intrinsic below is defined in emmintrin.h and can also be used with AVX. */
1606/// Compares each of the corresponding double-precision values of two
1607/// 128-bit vectors of [2 x double], using the operation specified by the
1608/// immediate integer operand.
1609///
1610/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1611/// If either value in a comparison is NaN, comparisons that are ordered
1612/// return false, and comparisons that are unordered return true.
1613///
1614/// \headerfile <x86intrin.h>
1615///
1616/// \code
1617/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1618/// \endcode
1619///
1620/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1621///
1622/// \param a
1623/// A 128-bit vector of [2 x double].
1624/// \param b
1625/// A 128-bit vector of [2 x double].
1626/// \param c
1627/// An immediate integer operand, with bits [4:0] specifying which comparison
1628/// operation to use: \n
1629/// 0x00: Equal (ordered, non-signaling) \n
1630/// 0x01: Less-than (ordered, signaling) \n
1631/// 0x02: Less-than-or-equal (ordered, signaling) \n
1632/// 0x03: Unordered (non-signaling) \n
1633/// 0x04: Not-equal (unordered, non-signaling) \n
1634/// 0x05: Not-less-than (unordered, signaling) \n
1635/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1636/// 0x07: Ordered (non-signaling) \n
1637/// 0x08: Equal (unordered, non-signaling) \n
1638/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1639/// 0x0A: Not-greater-than (unordered, signaling) \n
1640/// 0x0B: False (ordered, non-signaling) \n
1641/// 0x0C: Not-equal (ordered, non-signaling) \n
1642/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1643/// 0x0E: Greater-than (ordered, signaling) \n
1644/// 0x0F: True (unordered, non-signaling) \n
1645/// 0x10: Equal (ordered, signaling) \n
1646/// 0x11: Less-than (ordered, non-signaling) \n
1647/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1648/// 0x13: Unordered (signaling) \n
1649/// 0x14: Not-equal (unordered, signaling) \n
1650/// 0x15: Not-less-than (unordered, non-signaling) \n
1651/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1652/// 0x17: Ordered (signaling) \n
1653/// 0x18: Equal (unordered, signaling) \n
1654/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1655/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1656/// 0x1B: False (ordered, signaling) \n
1657/// 0x1C: Not-equal (ordered, signaling) \n
1658/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1659/// 0x1E: Greater-than (ordered, non-signaling) \n
1660/// 0x1F: True (unordered, signaling)
1661/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1662/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1663
/* The intrinsic below is defined in xmmintrin.h and can also be used with AVX. */
1665/// Compares each of the corresponding values of two 128-bit vectors of
1666/// [4 x float], using the operation specified by the immediate integer
1667/// operand.
1668///
1669/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1670/// If either value in a comparison is NaN, comparisons that are ordered
1671/// return false, and comparisons that are unordered return true.
1672///
1673/// \headerfile <x86intrin.h>
1674///
1675/// \code
1676/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1677/// \endcode
1678///
1679/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1680///
1681/// \param a
1682/// A 128-bit vector of [4 x float].
1683/// \param b
1684/// A 128-bit vector of [4 x float].
1685/// \param c
1686/// An immediate integer operand, with bits [4:0] specifying which comparison
1687/// operation to use: \n
1688/// 0x00: Equal (ordered, non-signaling) \n
1689/// 0x01: Less-than (ordered, signaling) \n
1690/// 0x02: Less-than-or-equal (ordered, signaling) \n
1691/// 0x03: Unordered (non-signaling) \n
1692/// 0x04: Not-equal (unordered, non-signaling) \n
1693/// 0x05: Not-less-than (unordered, signaling) \n
1694/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1695/// 0x07: Ordered (non-signaling) \n
1696/// 0x08: Equal (unordered, non-signaling) \n
1697/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1698/// 0x0A: Not-greater-than (unordered, signaling) \n
1699/// 0x0B: False (ordered, non-signaling) \n
1700/// 0x0C: Not-equal (ordered, non-signaling) \n
1701/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1702/// 0x0E: Greater-than (ordered, signaling) \n
1703/// 0x0F: True (unordered, non-signaling) \n
1704/// 0x10: Equal (ordered, signaling) \n
1705/// 0x11: Less-than (ordered, non-signaling) \n
1706/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1707/// 0x13: Unordered (signaling) \n
1708/// 0x14: Not-equal (unordered, signaling) \n
1709/// 0x15: Not-less-than (unordered, non-signaling) \n
1710/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1711/// 0x17: Ordered (signaling) \n
1712/// 0x18: Equal (unordered, signaling) \n
1713/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1714/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1715/// 0x1B: False (ordered, signaling) \n
1716/// 0x1C: Not-equal (ordered, signaling) \n
1717/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1718/// 0x1E: Greater-than (ordered, non-signaling) \n
1719/// 0x1F: True (unordered, signaling)
1720/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1721/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1722
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Values 0x08 through 0x1F are also available as the
///    named constants \c _CMP_EQ_UQ through \c _CMP_TRUE_US defined in this
///    header: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1782
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Values 0x08 through 0x1F are also available as the
///    named constants \c _CMP_EQ_UQ through \c _CMP_TRUE_US defined in this
///    header: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1842
/* The intrinsic below is defined in emmintrin.h and can also be used with AVX. */
1844/// Compares each of the corresponding scalar double-precision values of
1845/// two 128-bit vectors of [2 x double], using the operation specified by the
1846/// immediate integer operand.
1847///
1848/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1849/// If either value in a comparison is NaN, comparisons that are ordered
1850/// return false, and comparisons that are unordered return true.
1851///
1852/// \headerfile <x86intrin.h>
1853///
1854/// \code
1855/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1856/// \endcode
1857///
1858/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1859///
1860/// \param a
1861/// A 128-bit vector of [2 x double].
1862/// \param b
1863/// A 128-bit vector of [2 x double].
1864/// \param c
1865/// An immediate integer operand, with bits [4:0] specifying which comparison
1866/// operation to use: \n
1867/// 0x00: Equal (ordered, non-signaling) \n
1868/// 0x01: Less-than (ordered, signaling) \n
1869/// 0x02: Less-than-or-equal (ordered, signaling) \n
1870/// 0x03: Unordered (non-signaling) \n
1871/// 0x04: Not-equal (unordered, non-signaling) \n
1872/// 0x05: Not-less-than (unordered, signaling) \n
1873/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1874/// 0x07: Ordered (non-signaling) \n
1875/// 0x08: Equal (unordered, non-signaling) \n
1876/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1877/// 0x0A: Not-greater-than (unordered, signaling) \n
1878/// 0x0B: False (ordered, non-signaling) \n
1879/// 0x0C: Not-equal (ordered, non-signaling) \n
1880/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1881/// 0x0E: Greater-than (ordered, signaling) \n
1882/// 0x0F: True (unordered, non-signaling) \n
1883/// 0x10: Equal (ordered, signaling) \n
1884/// 0x11: Less-than (ordered, non-signaling) \n
1885/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1886/// 0x13: Unordered (signaling) \n
1887/// 0x14: Not-equal (unordered, signaling) \n
1888/// 0x15: Not-less-than (unordered, non-signaling) \n
1889/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1890/// 0x17: Ordered (signaling) \n
1891/// 0x18: Equal (unordered, signaling) \n
1892/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1893/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1894/// 0x1B: False (ordered, signaling) \n
1895/// 0x1C: Not-equal (ordered, signaling) \n
1896/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1897/// 0x1E: Greater-than (ordered, non-signaling) \n
1898/// 0x1F: True (unordered, signaling)
1899/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1900/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1901
/* The intrinsic below is defined in xmmintrin.h and can also be used with AVX. */
1903/// Compares each of the corresponding scalar values of two 128-bit
1904/// vectors of [4 x float], using the operation specified by the immediate
1905/// integer operand.
1906///
1907/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1908/// If either value in a comparison is NaN, comparisons that are ordered
1909/// return false, and comparisons that are unordered return true.
1910///
1911/// \headerfile <x86intrin.h>
1912///
1913/// \code
1914/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1915/// \endcode
1916///
1917/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1918///
1919/// \param a
1920/// A 128-bit vector of [4 x float].
1921/// \param b
1922/// A 128-bit vector of [4 x float].
1923/// \param c
1924/// An immediate integer operand, with bits [4:0] specifying which comparison
1925/// operation to use: \n
1926/// 0x00: Equal (ordered, non-signaling) \n
1927/// 0x01: Less-than (ordered, signaling) \n
1928/// 0x02: Less-than-or-equal (ordered, signaling) \n
1929/// 0x03: Unordered (non-signaling) \n
1930/// 0x04: Not-equal (unordered, non-signaling) \n
1931/// 0x05: Not-less-than (unordered, signaling) \n
1932/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1933/// 0x07: Ordered (non-signaling) \n
1934/// 0x08: Equal (unordered, non-signaling) \n
1935/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1936/// 0x0A: Not-greater-than (unordered, signaling) \n
1937/// 0x0B: False (ordered, non-signaling) \n
1938/// 0x0C: Not-equal (ordered, non-signaling) \n
1939/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1940/// 0x0E: Greater-than (ordered, signaling) \n
1941/// 0x0F: True (unordered, non-signaling) \n
1942/// 0x10: Equal (ordered, signaling) \n
1943/// 0x11: Less-than (ordered, non-signaling) \n
1944/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1945/// 0x13: Unordered (signaling) \n
1946/// 0x14: Not-equal (unordered, signaling) \n
1947/// 0x15: Not-less-than (unordered, non-signaling) \n
1948/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1949/// 0x17: Ordered (signaling) \n
1950/// 0x18: Equal (unordered, signaling) \n
1951/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1952/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1953/// 0x1B: False (ordered, signaling) \n
1954/// 0x1C: Not-equal (ordered, signaling) \n
1955/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1956/// 0x1E: Greater-than (ordered, non-signaling) \n
1957/// 0x1F: True (unordered, signaling)
1958/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1959/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1960
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1982
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    The extracted element is converted to <c>unsigned short</c> before being
///    widened to <c>int</c>, so the upper 16 bits of the result are zero.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
2005
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    The extracted element is converted to <c>unsigned char</c> before being
///    widened to <c>int</c>, so the upper 24 bits of the result are zero.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2028
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    This macro is only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2052
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced. \a X holds 8 elements, indexed 0 through 7.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2078
2079
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced. \a X holds 16 elements, indexed 0 through 15.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2105
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced. \a X holds 32 elements, indexed 0 through 31.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2131
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
///    This macro is only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced. \a X holds 4 elements, indexed 0 through 3.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2159
2160/* Conversion */
2161/// Converts a vector of [4 x i32] into a vector of [4 x double].
2162///
2163/// \headerfile <x86intrin.h>
2164///
2165/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2166///
2167/// \param __a
2168/// A 128-bit integer vector of [4 x i32].
2169/// \returns A 256-bit vector of [4 x double] containing the converted values.
2170static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2172 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2173}
2174
2175/// Converts a vector of [8 x i32] into a vector of [8 x float].
2176///
2177/// \headerfile <x86intrin.h>
2178///
2179/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2180///
2181/// \param __a
2182/// A 256-bit integer vector.
2183/// \returns A 256-bit vector of [8 x float] containing the converted values.
2184static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2186 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2187}
2188
2189/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2190/// [4 x float].
2191///
2192/// \headerfile <x86intrin.h>
2193///
2194/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2195///
2196/// \param __a
2197/// A 256-bit vector of [4 x double].
2198/// \returns A 128-bit vector of [4 x float] containing the converted values.
2199static __inline __m128 __DEFAULT_FN_ATTRS
2201{
2202 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2203}
2204
2205/// Converts a vector of [8 x float] into a vector of [8 x i32].
2206///
2207/// If a converted value does not fit in a 32-bit integer, raises a
2208/// floating-point invalid exception. If the exception is masked, returns
2209/// the most negative integer.
2210///
2211/// \headerfile <x86intrin.h>
2212///
2213/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2214///
2215/// \param __a
2216/// A 256-bit vector of [8 x float].
2217/// \returns A 256-bit integer vector containing the converted values.
2218static __inline __m256i __DEFAULT_FN_ATTRS
2220{
2221 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2222}
2223
2224/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2225/// x double].
2226///
2227/// \headerfile <x86intrin.h>
2228///
2229/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2230///
2231/// \param __a
2232/// A 128-bit vector of [4 x float].
2233/// \returns A 256-bit vector of [4 x double] containing the converted values.
2234static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2236 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2237}
2238
2239/// Converts a 256-bit vector of [4 x double] into four signed truncated
2240/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2241/// [4 x i32].
2242///
2243/// If a converted value does not fit in a 32-bit integer, raises a
2244/// floating-point invalid exception. If the exception is masked, returns
2245/// the most negative integer.
2246///
2247/// \headerfile <x86intrin.h>
2248///
2249/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2250///
2251/// \param __a
2252/// A 256-bit vector of [4 x double].
2253/// \returns A 128-bit integer vector containing the converted values.
2254static __inline __m128i __DEFAULT_FN_ATTRS
2256{
2257 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2258}
2259
2260/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2261/// [4 x i32].
2262///
2263/// If a converted value does not fit in a 32-bit integer, raises a
2264/// floating-point invalid exception. If the exception is masked, returns
2265/// the most negative integer.
2266///
2267/// \headerfile <x86intrin.h>
2268///
2269/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2270///
2271/// \param __a
2272/// A 256-bit vector of [4 x double].
2273/// \returns A 128-bit integer vector containing the converted values.
2274static __inline __m128i __DEFAULT_FN_ATTRS
2276{
2277 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2278}
2279
2280/// Converts a vector of [8 x float] into eight signed truncated (rounded
2281/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2282///
2283/// If a converted value does not fit in a 32-bit integer, raises a
2284/// floating-point invalid exception. If the exception is masked, returns
2285/// the most negative integer.
2286///
2287/// \headerfile <x86intrin.h>
2288///
2289/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2290///
2291/// \param __a
2292/// A 256-bit vector of [8 x float].
2293/// \returns A 256-bit integer vector containing the converted values.
2294static __inline __m256i __DEFAULT_FN_ATTRS
2296{
2297 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2298}
2299
2300/// Returns the first element of the input vector of [4 x double].
2301///
2302/// \headerfile <x86intrin.h>
2303///
2304/// This intrinsic is a utility function and does not correspond to a specific
2305/// instruction.
2306///
2307/// \param __a
2308/// A 256-bit vector of [4 x double].
2309/// \returns A 64 bit double containing the first element of the input vector.
2310static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR
2312 return __a[0];
2313}
2314
2315/// Returns the first element of the input vector of [8 x i32].
2316///
2317/// \headerfile <x86intrin.h>
2318///
2319/// This intrinsic is a utility function and does not correspond to a specific
2320/// instruction.
2321///
2322/// \param __a
2323/// A 256-bit vector of [8 x i32].
2324/// \returns A 32 bit integer containing the first element of the input vector.
2325static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2327 __v8si __b = (__v8si)__a;
2328 return __b[0];
2329}
2330
2331/// Returns the first element of the input vector of [8 x float].
2332///
2333/// \headerfile <x86intrin.h>
2334///
2335/// This intrinsic is a utility function and does not correspond to a specific
2336/// instruction.
2337///
2338/// \param __a
2339/// A 256-bit vector of [8 x float].
2340/// \returns A 32 bit float containing the first element of the input vector.
2341static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR
2343 return __a[0];
2344}
2345
2346/* Vector replicate */
2347/// Moves and duplicates odd-indexed values from a 256-bit vector of
2348/// [8 x float] to float values in a 256-bit vector of [8 x float].
2349///
2350/// \headerfile <x86intrin.h>
2351///
2352/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2353///
2354/// \param __a
2355/// A 256-bit vector of [8 x float]. \n
2356/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2357/// the return value. \n
2358/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2359/// the return value. \n
2360/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2361/// return value. \n
2362/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2363/// return value.
2364/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2365/// values.
2366static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2368{
2369 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2370}
2371
2372/// Moves and duplicates even-indexed values from a 256-bit vector of
2373/// [8 x float] to float values in a 256-bit vector of [8 x float].
2374///
2375/// \headerfile <x86intrin.h>
2376///
2377/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2378///
2379/// \param __a
2380/// A 256-bit vector of [8 x float]. \n
2381/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2382/// the return value. \n
2383/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2384/// the return value. \n
2385/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2386/// return value. \n
2387/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2388/// return value.
2389/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2390/// values.
2391static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2393{
2394 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2395}
2396
2397/// Moves and duplicates double-precision floating point values from a
2398/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2399/// vector of [4 x double].
2400///
2401/// \headerfile <x86intrin.h>
2402///
2403/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2404///
2405/// \param __a
2406/// A 256-bit vector of [4 x double]. \n
2407/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2408/// return value. \n
2409/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2410/// the return value.
2411/// \returns A 256-bit vector of [4 x double] containing the moved and
2412/// duplicated values.
2413static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2415{
2416 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2417}
2418
2419/* Unpack and Interleave */
2420/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2421/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2422///
2423/// \headerfile <x86intrin.h>
2424///
2425/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2426///
2427/// \param __a
2428/// A 256-bit floating-point vector of [4 x double]. \n
2429/// Bits [127:64] are written to bits [63:0] of the return value. \n
2430/// Bits [255:192] are written to bits [191:128] of the return value. \n
2431/// \param __b
2432/// A 256-bit floating-point vector of [4 x double]. \n
2433/// Bits [127:64] are written to bits [127:64] of the return value. \n
2434/// Bits [255:192] are written to bits [255:192] of the return value. \n
2435/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2436static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2437_mm256_unpackhi_pd(__m256d __a, __m256d __b) {
2438 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2439}
2440
2441/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2442/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2443///
2444/// \headerfile <x86intrin.h>
2445///
2446/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2447///
2448/// \param __a
2449/// A 256-bit floating-point vector of [4 x double]. \n
2450/// Bits [63:0] are written to bits [63:0] of the return value. \n
2451/// Bits [191:128] are written to bits [191:128] of the return value.
2452/// \param __b
2453/// A 256-bit floating-point vector of [4 x double]. \n
2454/// Bits [63:0] are written to bits [127:64] of the return value. \n
2455/// Bits [191:128] are written to bits [255:192] of the return value. \n
2456/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2457static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2458_mm256_unpacklo_pd(__m256d __a, __m256d __b) {
2459 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2460}
2461
2462/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2463/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2464/// vector of [8 x float].
2465///
2466/// \headerfile <x86intrin.h>
2467///
2468/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2469///
2470/// \param __a
2471/// A 256-bit vector of [8 x float]. \n
2472/// Bits [95:64] are written to bits [31:0] of the return value. \n
2473/// Bits [127:96] are written to bits [95:64] of the return value. \n
2474/// Bits [223:192] are written to bits [159:128] of the return value. \n
2475/// Bits [255:224] are written to bits [223:192] of the return value.
2476/// \param __b
2477/// A 256-bit vector of [8 x float]. \n
2478/// Bits [95:64] are written to bits [63:32] of the return value. \n
2479/// Bits [127:96] are written to bits [127:96] of the return value. \n
2480/// Bits [223:192] are written to bits [191:160] of the return value. \n
2481/// Bits [255:224] are written to bits [255:224] of the return value.
2482/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2483static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2484_mm256_unpackhi_ps(__m256 __a, __m256 __b) {
2485 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2486}
2487
2488/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2489/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2490/// vector of [8 x float].
2491///
2492/// \headerfile <x86intrin.h>
2493///
2494/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2495///
2496/// \param __a
2497/// A 256-bit vector of [8 x float]. \n
2498/// Bits [31:0] are written to bits [31:0] of the return value. \n
2499/// Bits [63:32] are written to bits [95:64] of the return value. \n
2500/// Bits [159:128] are written to bits [159:128] of the return value. \n
2501/// Bits [191:160] are written to bits [223:192] of the return value.
2502/// \param __b
2503/// A 256-bit vector of [8 x float]. \n
2504/// Bits [31:0] are written to bits [63:32] of the return value. \n
2505/// Bits [63:32] are written to bits [127:96] of the return value. \n
2506/// Bits [159:128] are written to bits [191:160] of the return value. \n
2507/// Bits [191:160] are written to bits [255:224] of the return value.
2508/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2509static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2510_mm256_unpacklo_ps(__m256 __a, __m256 __b) {
2511 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2512}
2513
2514/* Bit Test */
2515/// Given two 128-bit floating-point vectors of [2 x double], perform an
2516/// element-by-element comparison of the double-precision element in the
2517/// first source vector and the corresponding element in the second source
2518/// vector.
2519///
2520/// The EFLAGS register is updated as follows: \n
2521/// If there is at least one pair of double-precision elements where the
2522/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2523/// ZF flag is set to 1. \n
2524/// If there is at least one pair of double-precision elements where the
2525/// sign-bit of the first element is 0 and the sign-bit of the second element
2526/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2527/// This intrinsic returns the value of the ZF flag.
2528///
2529/// \headerfile <x86intrin.h>
2530///
2531/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2532///
2533/// \param __a
2534/// A 128-bit vector of [2 x double].
2535/// \param __b
2536/// A 128-bit vector of [2 x double].
2537/// \returns the ZF flag in the EFLAGS register.
2539 __m128d __b) {
2540 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2541}
2542
2543/// Given two 128-bit floating-point vectors of [2 x double], perform an
2544/// element-by-element comparison of the double-precision element in the
2545/// first source vector and the corresponding element in the second source
2546/// vector.
2547///
2548/// The EFLAGS register is updated as follows: \n
2549/// If there is at least one pair of double-precision elements where the
2550/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2551/// ZF flag is set to 1. \n
2552/// If there is at least one pair of double-precision elements where the
2553/// sign-bit of the first element is 0 and the sign-bit of the second element
2554/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2555/// This intrinsic returns the value of the CF flag.
2556///
2557/// \headerfile <x86intrin.h>
2558///
2559/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2560///
2561/// \param __a
2562/// A 128-bit vector of [2 x double].
2563/// \param __b
2564/// A 128-bit vector of [2 x double].
2565/// \returns the CF flag in the EFLAGS register.
2567 __m128d __b) {
2568 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2569}
2570
2571/// Given two 128-bit floating-point vectors of [2 x double], perform an
2572/// element-by-element comparison of the double-precision element in the
2573/// first source vector and the corresponding element in the second source
2574/// vector.
2575///
2576/// The EFLAGS register is updated as follows: \n
2577/// If there is at least one pair of double-precision elements where the
2578/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2579/// ZF flag is set to 1. \n
2580/// If there is at least one pair of double-precision elements where the
2581/// sign-bit of the first element is 0 and the sign-bit of the second element
2582/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2583/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2584/// otherwise it returns 0.
2585///
2586/// \headerfile <x86intrin.h>
2587///
2588/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2589///
2590/// \param __a
2591/// A 128-bit vector of [2 x double].
2592/// \param __b
2593/// A 128-bit vector of [2 x double].
2594/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2595static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR
2596_mm_testnzc_pd(__m128d __a, __m128d __b) {
2597 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2598}
2599
2600/// Given two 128-bit floating-point vectors of [4 x float], perform an
2601/// element-by-element comparison of the single-precision element in the
2602/// first source vector and the corresponding element in the second source
2603/// vector.
2604///
2605/// The EFLAGS register is updated as follows: \n
2606/// If there is at least one pair of single-precision elements where the
2607/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2608/// ZF flag is set to 1. \n
2609/// If there is at least one pair of single-precision elements where the
2610/// sign-bit of the first element is 0 and the sign-bit of the second element
2611/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2612/// This intrinsic returns the value of the ZF flag.
2613///
2614/// \headerfile <x86intrin.h>
2615///
2616/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2617///
2618/// \param __a
2619/// A 128-bit vector of [4 x float].
2620/// \param __b
2621/// A 128-bit vector of [4 x float].
2622/// \returns the ZF flag.
2624 __m128 __b) {
2625 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2626}
2627
2628/// Given two 128-bit floating-point vectors of [4 x float], perform an
2629/// element-by-element comparison of the single-precision element in the
2630/// first source vector and the corresponding element in the second source
2631/// vector.
2632///
2633/// The EFLAGS register is updated as follows: \n
2634/// If there is at least one pair of single-precision elements where the
2635/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2636/// ZF flag is set to 1. \n
2637/// If there is at least one pair of single-precision elements where the
2638/// sign-bit of the first element is 0 and the sign-bit of the second element
2639/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2640/// This intrinsic returns the value of the CF flag.
2641///
2642/// \headerfile <x86intrin.h>
2643///
2644/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2645///
2646/// \param __a
2647/// A 128-bit vector of [4 x float].
2648/// \param __b
2649/// A 128-bit vector of [4 x float].
2650/// \returns the CF flag.
2652 __m128 __b) {
2653 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2654}
2655
2656/// Given two 128-bit floating-point vectors of [4 x float], perform an
2657/// element-by-element comparison of the single-precision element in the
2658/// first source vector and the corresponding element in the second source
2659/// vector.
2660///
2661/// The EFLAGS register is updated as follows: \n
2662/// If there is at least one pair of single-precision elements where the
2663/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2664/// ZF flag is set to 1. \n
2665/// If there is at least one pair of single-precision elements where the
2666/// sign-bit of the first element is 0 and the sign-bit of the second element
2667/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2668/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2669/// otherwise it returns 0.
2670///
2671/// \headerfile <x86intrin.h>
2672///
2673/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2674///
2675/// \param __a
2676/// A 128-bit vector of [4 x float].
2677/// \param __b
2678/// A 128-bit vector of [4 x float].
2679/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2681 __m128 __b) {
2682 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2683}
2684
2685/// Given two 256-bit floating-point vectors of [4 x double], perform an
2686/// element-by-element comparison of the double-precision elements in the
2687/// first source vector and the corresponding elements in the second source
2688/// vector.
2689///
2690/// The EFLAGS register is updated as follows: \n
2691/// If there is at least one pair of double-precision elements where the
2692/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2693/// ZF flag is set to 1. \n
2694/// If there is at least one pair of double-precision elements where the
2695/// sign-bit of the first element is 0 and the sign-bit of the second element
2696/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2697/// This intrinsic returns the value of the ZF flag.
2698///
2699/// \headerfile <x86intrin.h>
2700///
2701/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2702///
2703/// \param __a
2704/// A 256-bit vector of [4 x double].
2705/// \param __b
2706/// A 256-bit vector of [4 x double].
2707/// \returns the ZF flag.
2709 __m256d __b) {
2710 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2711}
2712
2713/// Given two 256-bit floating-point vectors of [4 x double], perform an
2714/// element-by-element comparison of the double-precision elements in the
2715/// first source vector and the corresponding elements in the second source
2716/// vector.
2717///
2718/// The EFLAGS register is updated as follows: \n
2719/// If there is at least one pair of double-precision elements where the
2720/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2721/// ZF flag is set to 1. \n
2722/// If there is at least one pair of double-precision elements where the
2723/// sign-bit of the first element is 0 and the sign-bit of the second element
2724/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2725/// This intrinsic returns the value of the CF flag.
2726///
2727/// \headerfile <x86intrin.h>
2728///
2729/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2730///
2731/// \param __a
2732/// A 256-bit vector of [4 x double].
2733/// \param __b
2734/// A 256-bit vector of [4 x double].
2735/// \returns the CF flag.
2737 __m256d __b) {
2738 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2739}
2740
2741/// Given two 256-bit floating-point vectors of [4 x double], perform an
2742/// element-by-element comparison of the double-precision elements in the
2743/// first source vector and the corresponding elements in the second source
2744/// vector.
2745///
2746/// The EFLAGS register is updated as follows: \n
2747/// If there is at least one pair of double-precision elements where the
2748/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2749/// ZF flag is set to 1. \n
2750/// If there is at least one pair of double-precision elements where the
2751/// sign-bit of the first element is 0 and the sign-bit of the second element
2752/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2753/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2754/// otherwise it returns 0.
2755///
2756/// \headerfile <x86intrin.h>
2757///
2758/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2759///
2760/// \param __a
2761/// A 256-bit vector of [4 x double].
2762/// \param __b
2763/// A 256-bit vector of [4 x double].
2764/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2765static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2766_mm256_testnzc_pd(__m256d __a, __m256d __b) {
2767 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2768}
2769
2770/// Given two 256-bit floating-point vectors of [8 x float], perform an
2771/// element-by-element comparison of the single-precision element in the
2772/// first source vector and the corresponding element in the second source
2773/// vector.
2774///
2775/// The EFLAGS register is updated as follows: \n
2776/// If there is at least one pair of single-precision elements where the
2777/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2778/// ZF flag is set to 1. \n
2779/// If there is at least one pair of single-precision elements where the
2780/// sign-bit of the first element is 0 and the sign-bit of the second element
2781/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2782/// This intrinsic returns the value of the ZF flag.
2783///
2784/// \headerfile <x86intrin.h>
2785///
2786/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2787///
2788/// \param __a
2789/// A 256-bit vector of [8 x float].
2790/// \param __b
2791/// A 256-bit vector of [8 x float].
2792/// \returns the ZF flag.
2794 __m256 __b) {
2795 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2796}
2797
2798/// Given two 256-bit floating-point vectors of [8 x float], perform an
2799/// element-by-element comparison of the single-precision element in the
2800/// first source vector and the corresponding element in the second source
2801/// vector.
2802///
2803/// The EFLAGS register is updated as follows: \n
2804/// If there is at least one pair of single-precision elements where the
2805/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2806/// ZF flag is set to 1. \n
2807/// If there is at least one pair of single-precision elements where the
2808/// sign-bit of the first element is 0 and the sign-bit of the second element
2809/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2810/// This intrinsic returns the value of the CF flag.
2811///
2812/// \headerfile <x86intrin.h>
2813///
2814/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2815///
2816/// \param __a
2817/// A 256-bit vector of [8 x float].
2818/// \param __b
2819/// A 256-bit vector of [8 x float].
2820/// \returns the CF flag.
2822 __m256 __b) {
2823 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2824}
2825
2826/// Given two 256-bit floating-point vectors of [8 x float], perform an
2827/// element-by-element comparison of the single-precision elements in the
2828/// first source vector and the corresponding elements in the second source
2829/// vector.
2830///
2831/// The EFLAGS register is updated as follows: \n
2832/// If there is at least one pair of single-precision elements where the
2833/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2834/// ZF flag is set to 1. \n
2835/// If there is at least one pair of single-precision elements where the
2836/// sign-bit of the first element is 0 and the sign-bit of the second element
2837/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2838/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2839/// otherwise it returns 0.
2840///
2841/// \headerfile <x86intrin.h>
2842///
2843/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2844///
2845/// \param __a
2846/// A 256-bit vector of [8 x float].
2847/// \param __b
2848/// A 256-bit vector of [8 x float].
2849/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2851 __m256 __b) {
2852 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2853}
2854
2855/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2856/// of the two source vectors.
2857///
2858/// The EFLAGS register is updated as follows: \n
2859/// If there is at least one pair of bits where both bits are 1, the ZF flag
2860/// is set to 0. Otherwise the ZF flag is set to 1. \n
2861/// If there is at least one pair of bits where the bit from the first source
2862/// vector is 0 and the bit from the second source vector is 1, the CF flag
2863/// is set to 0. Otherwise the CF flag is set to 1. \n
2864/// This intrinsic returns the value of the ZF flag.
2865///
2866/// \headerfile <x86intrin.h>
2867///
2868/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2869///
2870/// \param __a
2871/// A 256-bit integer vector.
2872/// \param __b
2873/// A 256-bit integer vector.
2874/// \returns the ZF flag.
2875static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2876_mm256_testz_si256(__m256i __a, __m256i __b) {
2877 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2878}
2879
2880/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2881/// of the two source vectors.
2882///
2883/// The EFLAGS register is updated as follows: \n
2884/// If there is at least one pair of bits where both bits are 1, the ZF flag
2885/// is set to 0. Otherwise the ZF flag is set to 1. \n
2886/// If there is at least one pair of bits where the bit from the first source
2887/// vector is 0 and the bit from the second source vector is 1, the CF flag
2888/// is set to 0. Otherwise the CF flag is set to 1. \n
2889/// This intrinsic returns the value of the CF flag.
2890///
2891/// \headerfile <x86intrin.h>
2892///
2893/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2894///
2895/// \param __a
2896/// A 256-bit integer vector.
2897/// \param __b
2898/// A 256-bit integer vector.
2899/// \returns the CF flag.
2900static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2901_mm256_testc_si256(__m256i __a, __m256i __b) {
2902 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2903}
2904
2905/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2906/// of the two source vectors.
2907///
2908/// The EFLAGS register is updated as follows: \n
2909/// If there is at least one pair of bits where both bits are 1, the ZF flag
2910/// is set to 0. Otherwise the ZF flag is set to 1. \n
2911/// If there is at least one pair of bits where the bit from the first source
2912/// vector is 0 and the bit from the second source vector is 1, the CF flag
2913/// is set to 0. Otherwise the CF flag is set to 1. \n
2914/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2915/// otherwise it returns 0.
2916///
2917/// \headerfile <x86intrin.h>
2918///
2919/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2920///
2921/// \param __a
2922/// A 256-bit integer vector.
2923/// \param __b
2924/// A 256-bit integer vector.
2925/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2926static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2927_mm256_testnzc_si256(__m256i __a, __m256i __b) {
2928 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2929}
2930
2931/* Vector extract sign mask */
2932/// Extracts the sign bits of double-precision floating point elements
2933/// in a 256-bit vector of [4 x double] and writes them to the lower order
2934/// bits of the return value.
2935///
2936/// \headerfile <x86intrin.h>
2937///
2938/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2939///
2940/// \param __a
2941/// A 256-bit vector of [4 x double] containing the double-precision
2942/// floating point values with sign bits to be extracted.
2943/// \returns The sign bits from the operand, written to bits [3:0].
2944static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2946 return __builtin_ia32_movmskpd256((__v4df)__a);
2947}
2948
2949/// Extracts the sign bits of single-precision floating point elements
2950/// in a 256-bit vector of [8 x float] and writes them to the lower order
2951/// bits of the return value.
2952///
2953/// \headerfile <x86intrin.h>
2954///
2955/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2956///
2957/// \param __a
2958/// A 256-bit vector of [8 x float] containing the single-precision floating
2959/// point values with sign bits to be extracted.
2960/// \returns The sign bits from the operand, written to bits [7:0].
2961static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2963 return __builtin_ia32_movmskps256((__v8sf)__a);
2964}
2965
2966/* Vector __zero */
2967/// Zeroes the contents of all XMM or YMM registers.
2968///
2969/// \headerfile <x86intrin.h>
2970///
2971/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void) {
  /* No operands and no result: the builtin stands for the VZEROALL
     instruction itself. */
  __builtin_ia32_vzeroall();
}
2977
2978/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2979///
2980/// \headerfile <x86intrin.h>
2981///
2982/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void) {
  /* No operands and no result: the builtin stands for the VZEROUPPER
     instruction itself. */
  __builtin_ia32_vzeroupper();
}
2988
2989/* Vector load with broadcast */
2990/// Loads a scalar single-precision floating point value from the
2991/// specified address pointed to by \a __a and broadcasts it to the elements
2992/// of a [4 x float] vector.
2993///
2994/// \headerfile <x86intrin.h>
2995///
2996/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2997///
2998/// \param __a
2999/// The single-precision floating point value to be broadcast.
3000/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3001/// equal to the broadcast value.
3002static __inline __m128 __DEFAULT_FN_ATTRS128
3004{
3005 struct __mm_broadcast_ss_struct {
3006 float __f;
3007 } __attribute__((__packed__, __may_alias__));
3008 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3009 return __extension__ (__m128){ __f, __f, __f, __f };
3010}
3011
3012/// Loads a scalar double-precision floating point value from the
3013/// specified address pointed to by \a __a and broadcasts it to the elements
3014/// of a [4 x double] vector.
3015///
3016/// \headerfile <x86intrin.h>
3017///
3018/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3019///
3020/// \param __a
3021/// The double-precision floating point value to be broadcast.
3022/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3023/// equal to the broadcast value.
3024static __inline __m256d __DEFAULT_FN_ATTRS
3026{
3027 struct __mm256_broadcast_sd_struct {
3028 double __d;
3029 } __attribute__((__packed__, __may_alias__));
3030 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3031 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3032}
3033
3034/// Loads a scalar single-precision floating point value from the
3035/// specified address pointed to by \a __a and broadcasts it to the elements
3036/// of a [8 x float] vector.
3037///
3038/// \headerfile <x86intrin.h>
3039///
3040/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3041///
3042/// \param __a
3043/// The single-precision floating point value to be broadcast.
3044/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3045/// equal to the broadcast value.
3046static __inline __m256 __DEFAULT_FN_ATTRS
3048{
3049 struct __mm256_broadcast_ss_struct {
3050 float __f;
3051 } __attribute__((__packed__, __may_alias__));
3052 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3053 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3054}
3055
3056/// Loads the data from a 128-bit vector of [2 x double] from the
3057/// specified address pointed to by \a __a and broadcasts it to 128-bit
3058/// elements in a 256-bit vector of [4 x double].
3059///
3060/// \headerfile <x86intrin.h>
3061///
3062/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3063///
3064/// \param __a
3065/// The 128-bit vector of [2 x double] to be broadcast.
3066/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3067/// equal to the broadcast value.
3068static __inline __m256d __DEFAULT_FN_ATTRS
3070{
3071 __m128d __b = _mm_loadu_pd((const double *)__a);
3072 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3073 0, 1, 0, 1);
3074}
3075
3076/// Loads the data from a 128-bit vector of [4 x float] from the
3077/// specified address pointed to by \a __a and broadcasts it to 128-bit
3078/// elements in a 256-bit vector of [8 x float].
3079///
3080/// \headerfile <x86intrin.h>
3081///
3082/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3083///
3084/// \param __a
3085/// The 128-bit vector of [4 x float] to be broadcast.
3086/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3087/// equal to the broadcast value.
3088static __inline __m256 __DEFAULT_FN_ATTRS
3090{
3091 __m128 __b = _mm_loadu_ps((const float *)__a);
3092 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3093 0, 1, 2, 3, 0, 1, 2, 3);
3094}
3095
3096/* SIMD load ops */
3097/// Loads 4 double-precision floating point values from a 32-byte aligned
3098/// memory location pointed to by \a __p into a vector of [4 x double].
3099///
3100/// \headerfile <x86intrin.h>
3101///
3102/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3103///
3104/// \param __p
3105/// A 32-byte aligned pointer to a memory location containing
3106/// double-precision floating point values.
3107/// \returns A 256-bit vector of [4 x double] containing the moved values.
3108static __inline __m256d __DEFAULT_FN_ATTRS
3109_mm256_load_pd(double const *__p)
3110{
3111 return *(const __m256d *)__p;
3112}
3113
3114/// Loads 8 single-precision floating point values from a 32-byte aligned
3115/// memory location pointed to by \a __p into a vector of [8 x float].
3116///
3117/// \headerfile <x86intrin.h>
3118///
3119/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3120///
3121/// \param __p
3122/// A 32-byte aligned pointer to a memory location containing float values.
3123/// \returns A 256-bit vector of [8 x float] containing the moved values.
3124static __inline __m256 __DEFAULT_FN_ATTRS
3125_mm256_load_ps(float const *__p)
3126{
3127 return *(const __m256 *)__p;
3128}
3129
3130/// Loads 4 double-precision floating point values from an unaligned
3131/// memory location pointed to by \a __p into a vector of [4 x double].
3132///
3133/// \headerfile <x86intrin.h>
3134///
3135/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3136///
3137/// \param __p
3138/// A pointer to a memory location containing double-precision floating
3139/// point values.
3140/// \returns A 256-bit vector of [4 x double] containing the moved values.
3141static __inline __m256d __DEFAULT_FN_ATTRS
3142_mm256_loadu_pd(double const *__p)
3143{
3144 struct __loadu_pd {
3145 __m256d_u __v;
3146 } __attribute__((__packed__, __may_alias__));
3147 return ((const struct __loadu_pd*)__p)->__v;
3148}
3149
3150/// Loads 8 single-precision floating point values from an unaligned
3151/// memory location pointed to by \a __p into a vector of [8 x float].
3152///
3153/// \headerfile <x86intrin.h>
3154///
3155/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3156///
3157/// \param __p
3158/// A pointer to a memory location containing single-precision floating
3159/// point values.
3160/// \returns A 256-bit vector of [8 x float] containing the moved values.
3161static __inline __m256 __DEFAULT_FN_ATTRS
3163{
3164 struct __loadu_ps {
3165 __m256_u __v;
3166 } __attribute__((__packed__, __may_alias__));
3167 return ((const struct __loadu_ps*)__p)->__v;
3168}
3169
3170/// Loads 256 bits of integer data from a 32-byte aligned memory
3171/// location pointed to by \a __p into elements of a 256-bit integer vector.
3172///
3173/// \headerfile <x86intrin.h>
3174///
3175/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3176///
3177/// \param __p
3178/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3179/// values.
3180/// \returns A 256-bit integer vector containing the moved values.
3181static __inline __m256i __DEFAULT_FN_ATTRS
3182_mm256_load_si256(__m256i const *__p)
3183{
3184 return *__p;
3185}
3186
3187/// Loads 256 bits of integer data from an unaligned memory location
3188/// pointed to by \a __p into a 256-bit integer vector.
3189///
3190/// \headerfile <x86intrin.h>
3191///
3192/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3193///
3194/// \param __p
3195/// A pointer to a 256-bit integer vector containing integer values.
3196/// \returns A 256-bit integer vector containing the moved values.
3197static __inline __m256i __DEFAULT_FN_ATTRS
3198_mm256_loadu_si256(__m256i_u const *__p)
3199{
3200 struct __loadu_si256 {
3201 __m256i_u __v;
3202 } __attribute__((__packed__, __may_alias__));
3203 return ((const struct __loadu_si256*)__p)->__v;
3204}
3205
3206/// Loads 256 bits of integer data from an unaligned memory location
3207/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3208/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3209/// line boundary.
3210///
3211/// \headerfile <x86intrin.h>
3212///
3213/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3214///
3215/// \param __p
3216/// A pointer to a 256-bit integer vector containing integer values.
3217/// \returns A 256-bit integer vector containing the moved values.
3218static __inline __m256i __DEFAULT_FN_ATTRS
3219_mm256_lddqu_si256(__m256i_u const *__p)
3220{
3221 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3222}
3223
3224/* SIMD store ops */
3225/// Stores double-precision floating point values from a 256-bit vector
3226/// of [4 x double] to a 32-byte aligned memory location pointed to by
3227/// \a __p.
3228///
3229/// \headerfile <x86intrin.h>
3230///
3231/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3232///
3233/// \param __p
3234/// A 32-byte aligned pointer to a memory location that will receive the
3235/// double-precision floaing point values.
3236/// \param __a
3237/// A 256-bit vector of [4 x double] containing the values to be moved.
3238static __inline void __DEFAULT_FN_ATTRS
3239_mm256_store_pd(double *__p, __m256d __a)
3240{
3241 *(__m256d *)__p = __a;
3242}
3243
3244/// Stores single-precision floating point values from a 256-bit vector
3245/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3246///
3247/// \headerfile <x86intrin.h>
3248///
3249/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3250///
3251/// \param __p
3252/// A 32-byte aligned pointer to a memory location that will receive the
3253/// float values.
3254/// \param __a
3255/// A 256-bit vector of [8 x float] containing the values to be moved.
3256static __inline void __DEFAULT_FN_ATTRS
3257_mm256_store_ps(float *__p, __m256 __a)
3258{
3259 *(__m256 *)__p = __a;
3260}
3261
3262/// Stores double-precision floating point values from a 256-bit vector
3263/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3264///
3265/// \headerfile <x86intrin.h>
3266///
3267/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3268///
3269/// \param __p
3270/// A pointer to a memory location that will receive the double-precision
3271/// floating point values.
3272/// \param __a
3273/// A 256-bit vector of [4 x double] containing the values to be moved.
3274static __inline void __DEFAULT_FN_ATTRS
3275_mm256_storeu_pd(double *__p, __m256d __a)
3276{
3277 struct __storeu_pd {
3278 __m256d_u __v;
3279 } __attribute__((__packed__, __may_alias__));
3280 ((struct __storeu_pd*)__p)->__v = __a;
3281}
3282
3283/// Stores single-precision floating point values from a 256-bit vector
3284/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3285///
3286/// \headerfile <x86intrin.h>
3287///
3288/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3289///
3290/// \param __p
3291/// A pointer to a memory location that will receive the float values.
3292/// \param __a
3293/// A 256-bit vector of [8 x float] containing the values to be moved.
3294static __inline void __DEFAULT_FN_ATTRS
3295_mm256_storeu_ps(float *__p, __m256 __a)
3296{
3297 struct __storeu_ps {
3298 __m256_u __v;
3299 } __attribute__((__packed__, __may_alias__));
3300 ((struct __storeu_ps*)__p)->__v = __a;
3301}
3302
3303/// Stores integer values from a 256-bit integer vector to a 32-byte
3304/// aligned memory location pointed to by \a __p.
3305///
3306/// \headerfile <x86intrin.h>
3307///
3308/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3309///
3310/// \param __p
3311/// A 32-byte aligned pointer to a memory location that will receive the
3312/// integer values.
3313/// \param __a
3314/// A 256-bit integer vector containing the values to be moved.
3315static __inline void __DEFAULT_FN_ATTRS
3316_mm256_store_si256(__m256i *__p, __m256i __a)
3317{
3318 *__p = __a;
3319}
3320
3321/// Stores integer values from a 256-bit integer vector to an unaligned
3322/// memory location pointed to by \a __p.
3323///
3324/// \headerfile <x86intrin.h>
3325///
3326/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3327///
3328/// \param __p
3329/// A pointer to a memory location that will receive the integer values.
3330/// \param __a
3331/// A 256-bit integer vector containing the values to be moved.
3332static __inline void __DEFAULT_FN_ATTRS
3333_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3334{
3335 struct __storeu_si256 {
3336 __m256i_u __v;
3337 } __attribute__((__packed__, __may_alias__));
3338 ((struct __storeu_si256*)__p)->__v = __a;
3339}
3340
3341/* Conditional load ops */
3342/// Conditionally loads double-precision floating point elements from a
3343/// memory location pointed to by \a __p into a 128-bit vector of
3344/// [2 x double], depending on the mask bits associated with each data
3345/// element.
3346///
3347/// \headerfile <x86intrin.h>
3348///
3349/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3350///
3351/// \param __p
3352/// A pointer to a memory location that contains the double-precision
3353/// floating point values.
3354/// \param __m
3355/// A 128-bit integer vector containing the mask. The most significant bit of
3356/// each data element represents the mask bits. If a mask bit is zero, the
3357/// corresponding value in the memory location is not loaded and the
3358/// corresponding field in the return value is set to zero.
3359/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3360static __inline __m128d __DEFAULT_FN_ATTRS128
3361_mm_maskload_pd(double const *__p, __m128i __m)
3362{
3363 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3364}
3365
3366/// Conditionally loads double-precision floating point elements from a
3367/// memory location pointed to by \a __p into a 256-bit vector of
3368/// [4 x double], depending on the mask bits associated with each data
3369/// element.
3370///
3371/// \headerfile <x86intrin.h>
3372///
3373/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3374///
3375/// \param __p
3376/// A pointer to a memory location that contains the double-precision
3377/// floating point values.
3378/// \param __m
3379/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3380/// significant bit of each quadword element represents the mask bits. If a
3381/// mask bit is zero, the corresponding value in the memory location is not
3382/// loaded and the corresponding field in the return value is set to zero.
3383/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3384static __inline __m256d __DEFAULT_FN_ATTRS
3385_mm256_maskload_pd(double const *__p, __m256i __m)
3386{
3387 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3388 (__v4di)__m);
3389}
3390
3391/// Conditionally loads single-precision floating point elements from a
3392/// memory location pointed to by \a __p into a 128-bit vector of
3393/// [4 x float], depending on the mask bits associated with each data
3394/// element.
3395///
3396/// \headerfile <x86intrin.h>
3397///
3398/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3399///
3400/// \param __p
3401/// A pointer to a memory location that contains the single-precision
3402/// floating point values.
3403/// \param __m
3404/// A 128-bit integer vector containing the mask. The most significant bit of
3405/// each data element represents the mask bits. If a mask bit is zero, the
3406/// corresponding value in the memory location is not loaded and the
3407/// corresponding field in the return value is set to zero.
3408/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3409static __inline __m128 __DEFAULT_FN_ATTRS128
3410_mm_maskload_ps(float const *__p, __m128i __m)
3411{
3412 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3413}
3414
3415/// Conditionally loads single-precision floating point elements from a
3416/// memory location pointed to by \a __p into a 256-bit vector of
3417/// [8 x float], depending on the mask bits associated with each data
3418/// element.
3419///
3420/// \headerfile <x86intrin.h>
3421///
3422/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3423///
3424/// \param __p
3425/// A pointer to a memory location that contains the single-precision
3426/// floating point values.
3427/// \param __m
3428/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3429/// significant bit of each dword element represents the mask bits. If a mask
3430/// bit is zero, the corresponding value in the memory location is not loaded
3431/// and the corresponding field in the return value is set to zero.
3432/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3433static __inline __m256 __DEFAULT_FN_ATTRS
3434_mm256_maskload_ps(float const *__p, __m256i __m)
3435{
3436 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3437}
3438
3439/* Conditional store ops */
3440/// Moves single-precision floating point values from a 256-bit vector
3441/// of [8 x float] to a memory location pointed to by \a __p, according to
3442/// the specified mask.
3443///
3444/// \headerfile <x86intrin.h>
3445///
3446/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3447///
3448/// \param __p
3449/// A pointer to a memory location that will receive the float values.
3450/// \param __m
3451/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3452/// significant bit of each dword element in the mask vector represents the
3453/// mask bits. If a mask bit is zero, the corresponding value from vector
3454/// \a __a is not stored and the corresponding field in the memory location
3455/// pointed to by \a __p is not changed.
3456/// \param __a
3457/// A 256-bit vector of [8 x float] containing the values to be stored.
3458static __inline void __DEFAULT_FN_ATTRS
3459_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3460{
3461 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3462}
3463
3464/// Moves double-precision values from a 128-bit vector of [2 x double]
3465/// to a memory location pointed to by \a __p, according to the specified
3466/// mask.
3467///
3468/// \headerfile <x86intrin.h>
3469///
3470/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3471///
3472/// \param __p
3473/// A pointer to a memory location that will receive the float values.
3474/// \param __m
3475/// A 128-bit integer vector containing the mask. The most significant bit of
3476/// each field in the mask vector represents the mask bits. If a mask bit is
3477/// zero, the corresponding value from vector \a __a is not stored and the
3478/// corresponding field in the memory location pointed to by \a __p is not
3479/// changed.
3480/// \param __a
3481/// A 128-bit vector of [2 x double] containing the values to be stored.
3482static __inline void __DEFAULT_FN_ATTRS128
3483_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3484{
3485 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3486}
3487
3488/// Moves double-precision values from a 256-bit vector of [4 x double]
3489/// to a memory location pointed to by \a __p, according to the specified
3490/// mask.
3491///
3492/// \headerfile <x86intrin.h>
3493///
3494/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3495///
3496/// \param __p
3497/// A pointer to a memory location that will receive the float values.
3498/// \param __m
3499/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3500/// significant bit of each quadword element in the mask vector represents
3501/// the mask bits. If a mask bit is zero, the corresponding value from vector
3502/// __a is not stored and the corresponding field in the memory location
3503/// pointed to by \a __p is not changed.
3504/// \param __a
3505/// A 256-bit vector of [4 x double] containing the values to be stored.
3506static __inline void __DEFAULT_FN_ATTRS
3507_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3508{
3509 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3510}
3511
3512/// Moves single-precision floating point values from a 128-bit vector
3513/// of [4 x float] to a memory location pointed to by \a __p, according to
3514/// the specified mask.
3515///
3516/// \headerfile <x86intrin.h>
3517///
3518/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3519///
3520/// \param __p
3521/// A pointer to a memory location that will receive the float values.
3522/// \param __m
3523/// A 128-bit integer vector containing the mask. The most significant bit of
3524/// each field in the mask vector represents the mask bits. If a mask bit is
3525/// zero, the corresponding value from vector __a is not stored and the
3526/// corresponding field in the memory location pointed to by \a __p is not
3527/// changed.
3528/// \param __a
3529/// A 128-bit vector of [4 x float] containing the values to be stored.
3530static __inline void __DEFAULT_FN_ATTRS128
3531_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3532{
3533 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3534}
3535
3536/* Cacheability support ops */
3537/// Moves integer data from a 256-bit integer vector to a 32-byte
3538/// aligned memory location. To minimize caching, the data is flagged as
3539/// non-temporal (unlikely to be used again soon).
3540///
3541/// \headerfile <x86intrin.h>
3542///
3543/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3544///
3545/// \param __a
3546/// A pointer to a 32-byte aligned memory location that will receive the
3547/// integer values.
3548/// \param __b
3549/// A 256-bit integer vector containing the values to be moved.
3550static __inline void __DEFAULT_FN_ATTRS
3552{
3553 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3554 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3555}
3556
3557/// Moves double-precision values from a 256-bit vector of [4 x double]
3558/// to a 32-byte aligned memory location. To minimize caching, the data is
3559/// flagged as non-temporal (unlikely to be used again soon).
3560///
3561/// \headerfile <x86intrin.h>
3562///
3563/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3564///
3565/// \param __a
3566/// A pointer to a 32-byte aligned memory location that will receive the
3567/// double-precision floating-point values.
3568/// \param __b
3569/// A 256-bit vector of [4 x double] containing the values to be moved.
3570static __inline void __DEFAULT_FN_ATTRS
3571_mm256_stream_pd(void *__a, __m256d __b)
3572{
3573 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3574 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3575}
3576
3577/// Moves single-precision floating point values from a 256-bit vector
3578/// of [8 x float] to a 32-byte aligned memory location. To minimize
3579/// caching, the data is flagged as non-temporal (unlikely to be used again
3580/// soon).
3581///
3582/// \headerfile <x86intrin.h>
3583///
3584/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3585///
3586/// \param __p
3587/// A pointer to a 32-byte aligned memory location that will receive the
3588/// single-precision floating point values.
3589/// \param __a
3590/// A 256-bit vector of [8 x float] containing the values to be moved.
3591static __inline void __DEFAULT_FN_ATTRS
3593{
3594 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3595 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3596}
3597
3598/* Create vectors */
3599/// Create a 256-bit vector of [4 x double] with undefined values.
3600///
3601/// \headerfile <x86intrin.h>
3602///
3603/// This intrinsic has no corresponding instruction.
3604///
3605/// \returns A 256-bit vector of [4 x double] containing undefined values.
3606static __inline__ __m256d __DEFAULT_FN_ATTRS
3608{
3609 return (__m256d)__builtin_ia32_undef256();
3610}
3611
3612/// Create a 256-bit vector of [8 x float] with undefined values.
3613///
3614/// \headerfile <x86intrin.h>
3615///
3616/// This intrinsic has no corresponding instruction.
3617///
3618/// \returns A 256-bit vector of [8 x float] containing undefined values.
3619static __inline__ __m256 __DEFAULT_FN_ATTRS
3621{
3622 return (__m256)__builtin_ia32_undef256();
3623}
3624
3625/// Create a 256-bit integer vector with undefined values.
3626///
3627/// \headerfile <x86intrin.h>
3628///
3629/// This intrinsic has no corresponding instruction.
3630///
3631/// \returns A 256-bit integer vector containing undefined values.
3632static __inline__ __m256i __DEFAULT_FN_ATTRS
3634{
3635 return (__m256i)__builtin_ia32_undef256();
3636}
3637
3638/// Constructs a 256-bit floating-point vector of [4 x double]
3639/// initialized with the specified double-precision floating-point values.
3640///
3641/// \headerfile <x86intrin.h>
3642///
3643/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3644/// instruction.
3645///
3646/// \param __a
3647/// A double-precision floating-point value used to initialize bits [255:192]
3648/// of the result.
3649/// \param __b
3650/// A double-precision floating-point value used to initialize bits [191:128]
3651/// of the result.
3652/// \param __c
3653/// A double-precision floating-point value used to initialize bits [127:64]
3654/// of the result.
3655/// \param __d
3656/// A double-precision floating-point value used to initialize bits [63:0]
3657/// of the result.
3658/// \returns An initialized 256-bit floating-point vector of [4 x double].
3659static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3660_mm256_set_pd(double __a, double __b, double __c, double __d)
3661{
3662 return __extension__ (__m256d){ __d, __c, __b, __a };
3663}
3664
3665/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3666/// with the specified single-precision floating-point values.
3667///
3668/// \headerfile <x86intrin.h>
3669///
3670/// This intrinsic is a utility function and does not correspond to a specific
3671/// instruction.
3672///
3673/// \param __a
3674/// A single-precision floating-point value used to initialize bits [255:224]
3675/// of the result.
3676/// \param __b
3677/// A single-precision floating-point value used to initialize bits [223:192]
3678/// of the result.
3679/// \param __c
3680/// A single-precision floating-point value used to initialize bits [191:160]
3681/// of the result.
3682/// \param __d
3683/// A single-precision floating-point value used to initialize bits [159:128]
3684/// of the result.
3685/// \param __e
3686/// A single-precision floating-point value used to initialize bits [127:96]
3687/// of the result.
3688/// \param __f
3689/// A single-precision floating-point value used to initialize bits [95:64]
3690/// of the result.
3691/// \param __g
3692/// A single-precision floating-point value used to initialize bits [63:32]
3693/// of the result.
3694/// \param __h
3695/// A single-precision floating-point value used to initialize bits [31:0]
3696/// of the result.
3697/// \returns An initialized 256-bit floating-point vector of [8 x float].
3698static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3699_mm256_set_ps(float __a, float __b, float __c, float __d,
3700 float __e, float __f, float __g, float __h)
3701{
3702 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3703}
3704
3705/// Constructs a 256-bit integer vector initialized with the specified
3706/// 32-bit integral values.
3707///
3708/// \headerfile <x86intrin.h>
3709///
3710/// This intrinsic is a utility function and does not correspond to a specific
3711/// instruction.
3712///
3713/// \param __i0
3714/// A 32-bit integral value used to initialize bits [255:224] of the result.
3715/// \param __i1
3716/// A 32-bit integral value used to initialize bits [223:192] of the result.
3717/// \param __i2
3718/// A 32-bit integral value used to initialize bits [191:160] of the result.
3719/// \param __i3
3720/// A 32-bit integral value used to initialize bits [159:128] of the result.
3721/// \param __i4
3722/// A 32-bit integral value used to initialize bits [127:96] of the result.
3723/// \param __i5
3724/// A 32-bit integral value used to initialize bits [95:64] of the result.
3725/// \param __i6
3726/// A 32-bit integral value used to initialize bits [63:32] of the result.
3727/// \param __i7
3728/// A 32-bit integral value used to initialize bits [31:0] of the result.
3729/// \returns An initialized 256-bit integer vector.
3730static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3731_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3732 int __i4, int __i5, int __i6, int __i7)
3733{
3734 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3735}
3736
3737/// Constructs a 256-bit integer vector initialized with the specified
3738/// 16-bit integral values.
3739///
3740/// \headerfile <x86intrin.h>
3741///
3742/// This intrinsic is a utility function and does not correspond to a specific
3743/// instruction.
3744///
3745/// \param __w15
3746/// A 16-bit integral value used to initialize bits [255:240] of the result.
3747/// \param __w14
3748/// A 16-bit integral value used to initialize bits [239:224] of the result.
3749/// \param __w13
3750/// A 16-bit integral value used to initialize bits [223:208] of the result.
3751/// \param __w12
3752/// A 16-bit integral value used to initialize bits [207:192] of the result.
3753/// \param __w11
3754/// A 16-bit integral value used to initialize bits [191:176] of the result.
3755/// \param __w10
3756/// A 16-bit integral value used to initialize bits [175:160] of the result.
3757/// \param __w09
3758/// A 16-bit integral value used to initialize bits [159:144] of the result.
3759/// \param __w08
3760/// A 16-bit integral value used to initialize bits [143:128] of the result.
3761/// \param __w07
3762/// A 16-bit integral value used to initialize bits [127:112] of the result.
3763/// \param __w06
3764/// A 16-bit integral value used to initialize bits [111:96] of the result.
3765/// \param __w05
3766/// A 16-bit integral value used to initialize bits [95:80] of the result.
3767/// \param __w04
3768/// A 16-bit integral value used to initialize bits [79:64] of the result.
3769/// \param __w03
3770/// A 16-bit integral value used to initialize bits [63:48] of the result.
3771/// \param __w02
3772/// A 16-bit integral value used to initialize bits [47:32] of the result.
3773/// \param __w01
3774/// A 16-bit integral value used to initialize bits [31:16] of the result.
3775/// \param __w00
3776/// A 16-bit integral value used to initialize bits [15:0] of the result.
3777/// \returns An initialized 256-bit integer vector.
3778static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3779_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3780 short __w11, short __w10, short __w09, short __w08,
3781 short __w07, short __w06, short __w05, short __w04,
3782 short __w03, short __w02, short __w01, short __w00)
3783{
3784 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3785 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3786}
3787
3788/// Constructs a 256-bit integer vector initialized with the specified
3789/// 8-bit integral values.
3790///
3791/// \headerfile <x86intrin.h>
3792///
3793/// This intrinsic is a utility function and does not correspond to a specific
3794/// instruction.
3795///
3796/// \param __b31
3797/// An 8-bit integral value used to initialize bits [255:248] of the result.
3798/// \param __b30
3799/// An 8-bit integral value used to initialize bits [247:240] of the result.
3800/// \param __b29
3801/// An 8-bit integral value used to initialize bits [239:232] of the result.
3802/// \param __b28
3803/// An 8-bit integral value used to initialize bits [231:224] of the result.
3804/// \param __b27
3805/// An 8-bit integral value used to initialize bits [223:216] of the result.
3806/// \param __b26
3807/// An 8-bit integral value used to initialize bits [215:208] of the result.
3808/// \param __b25
3809/// An 8-bit integral value used to initialize bits [207:200] of the result.
3810/// \param __b24
3811/// An 8-bit integral value used to initialize bits [199:192] of the result.
3812/// \param __b23
3813/// An 8-bit integral value used to initialize bits [191:184] of the result.
3814/// \param __b22
3815/// An 8-bit integral value used to initialize bits [183:176] of the result.
3816/// \param __b21
3817/// An 8-bit integral value used to initialize bits [175:168] of the result.
3818/// \param __b20
3819/// An 8-bit integral value used to initialize bits [167:160] of the result.
3820/// \param __b19
3821/// An 8-bit integral value used to initialize bits [159:152] of the result.
3822/// \param __b18
3823/// An 8-bit integral value used to initialize bits [151:144] of the result.
3824/// \param __b17
3825/// An 8-bit integral value used to initialize bits [143:136] of the result.
3826/// \param __b16
3827/// An 8-bit integral value used to initialize bits [135:128] of the result.
3828/// \param __b15
3829/// An 8-bit integral value used to initialize bits [127:120] of the result.
3830/// \param __b14
3831/// An 8-bit integral value used to initialize bits [119:112] of the result.
3832/// \param __b13
3833/// An 8-bit integral value used to initialize bits [111:104] of the result.
3834/// \param __b12
3835/// An 8-bit integral value used to initialize bits [103:96] of the result.
3836/// \param __b11
3837/// An 8-bit integral value used to initialize bits [95:88] of the result.
3838/// \param __b10
3839/// An 8-bit integral value used to initialize bits [87:80] of the result.
3840/// \param __b09
3841/// An 8-bit integral value used to initialize bits [79:72] of the result.
3842/// \param __b08
3843/// An 8-bit integral value used to initialize bits [71:64] of the result.
3844/// \param __b07
3845/// An 8-bit integral value used to initialize bits [63:56] of the result.
3846/// \param __b06
3847/// An 8-bit integral value used to initialize bits [55:48] of the result.
3848/// \param __b05
3849/// An 8-bit integral value used to initialize bits [47:40] of the result.
3850/// \param __b04
3851/// An 8-bit integral value used to initialize bits [39:32] of the result.
3852/// \param __b03
3853/// An 8-bit integral value used to initialize bits [31:24] of the result.
3854/// \param __b02
3855/// An 8-bit integral value used to initialize bits [23:16] of the result.
3856/// \param __b01
3857/// An 8-bit integral value used to initialize bits [15:8] of the result.
3858/// \param __b00
3859/// An 8-bit integral value used to initialize bits [7:0] of the result.
3860/// \returns An initialized 256-bit integer vector.
3861static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3862_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3863 char __b27, char __b26, char __b25, char __b24,
3864 char __b23, char __b22, char __b21, char __b20,
3865 char __b19, char __b18, char __b17, char __b16,
3866 char __b15, char __b14, char __b13, char __b12,
3867 char __b11, char __b10, char __b09, char __b08,
3868 char __b07, char __b06, char __b05, char __b04,
3869 char __b03, char __b02, char __b01, char __b00)
3870{
3871 return __extension__ (__m256i)(__v32qi){
3872 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3873 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3874 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3875 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3876 };
3877}
3878
3879/// Constructs a 256-bit integer vector initialized with the specified
3880/// 64-bit integral values.
3881///
3882/// \headerfile <x86intrin.h>
3883///
3884/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3885/// instruction.
3886///
3887/// \param __a
3888/// A 64-bit integral value used to initialize bits [255:192] of the result.
3889/// \param __b
3890/// A 64-bit integral value used to initialize bits [191:128] of the result.
3891/// \param __c
3892/// A 64-bit integral value used to initialize bits [127:64] of the result.
3893/// \param __d
3894/// A 64-bit integral value used to initialize bits [63:0] of the result.
3895/// \returns An initialized 256-bit integer vector.
3896static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3897_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3898{
3899 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3900}
3901
3902/* Create vectors with elements in reverse order */
3903/// Constructs a 256-bit floating-point vector of [4 x double],
3904/// initialized in reverse order with the specified double-precision
3905/// floating-point values.
3906///
3907/// \headerfile <x86intrin.h>
3908///
3909/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3910/// instruction.
3911///
3912/// \param __a
3913/// A double-precision floating-point value used to initialize bits [63:0]
3914/// of the result.
3915/// \param __b
3916/// A double-precision floating-point value used to initialize bits [127:64]
3917/// of the result.
3918/// \param __c
3919/// A double-precision floating-point value used to initialize bits [191:128]
3920/// of the result.
3921/// \param __d
3922/// A double-precision floating-point value used to initialize bits [255:192]
3923/// of the result.
3924/// \returns An initialized 256-bit floating-point vector of [4 x double].
3925static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3926_mm256_setr_pd(double __a, double __b, double __c, double __d)
3927{
3928 return _mm256_set_pd(__d, __c, __b, __a);
3929}
3930
3931/// Constructs a 256-bit floating-point vector of [8 x float],
3932/// initialized in reverse order with the specified single-precision
3933/// floating-point values.
3934///
3935/// \headerfile <x86intrin.h>
3936///
3937/// This intrinsic is a utility function and does not correspond to a specific
3938/// instruction.
3939///
3940/// \param __a
3941/// A single-precision floating-point value used to initialize bits [31:0]
3942/// of the result.
3943/// \param __b
3944/// A single-precision floating-point value used to initialize bits [63:32]
3945/// of the result.
3946/// \param __c
3947/// A single-precision floating-point value used to initialize bits [95:64]
3948/// of the result.
3949/// \param __d
3950/// A single-precision floating-point value used to initialize bits [127:96]
3951/// of the result.
3952/// \param __e
3953/// A single-precision floating-point value used to initialize bits [159:128]
3954/// of the result.
3955/// \param __f
3956/// A single-precision floating-point value used to initialize bits [191:160]
3957/// of the result.
3958/// \param __g
3959/// A single-precision floating-point value used to initialize bits [223:192]
3960/// of the result.
3961/// \param __h
3962/// A single-precision floating-point value used to initialize bits [255:224]
3963/// of the result.
3964/// \returns An initialized 256-bit floating-point vector of [8 x float].
3965static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3966_mm256_setr_ps(float __a, float __b, float __c, float __d,
3967 float __e, float __f, float __g, float __h)
3968{
3969 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3970}
3971
3972/// Constructs a 256-bit integer vector, initialized in reverse order
3973/// with the specified 32-bit integral values.
3974///
3975/// \headerfile <x86intrin.h>
3976///
3977/// This intrinsic is a utility function and does not correspond to a specific
3978/// instruction.
3979///
3980/// \param __i0
3981/// A 32-bit integral value used to initialize bits [31:0] of the result.
3982/// \param __i1
3983/// A 32-bit integral value used to initialize bits [63:32] of the result.
3984/// \param __i2
3985/// A 32-bit integral value used to initialize bits [95:64] of the result.
3986/// \param __i3
3987/// A 32-bit integral value used to initialize bits [127:96] of the result.
3988/// \param __i4
3989/// A 32-bit integral value used to initialize bits [159:128] of the result.
3990/// \param __i5
3991/// A 32-bit integral value used to initialize bits [191:160] of the result.
3992/// \param __i6
3993/// A 32-bit integral value used to initialize bits [223:192] of the result.
3994/// \param __i7
3995/// A 32-bit integral value used to initialize bits [255:224] of the result.
3996/// \returns An initialized 256-bit integer vector.
3997static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3998_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3999 int __i4, int __i5, int __i6, int __i7)
4000{
4001 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4002}
4003
4004/// Constructs a 256-bit integer vector, initialized in reverse order
4005/// with the specified 16-bit integral values.
4006///
4007/// \headerfile <x86intrin.h>
4008///
4009/// This intrinsic is a utility function and does not correspond to a specific
4010/// instruction.
4011///
4012/// \param __w15
4013/// A 16-bit integral value used to initialize bits [15:0] of the result.
4014/// \param __w14
4015/// A 16-bit integral value used to initialize bits [31:16] of the result.
4016/// \param __w13
4017/// A 16-bit integral value used to initialize bits [47:32] of the result.
4018/// \param __w12
4019/// A 16-bit integral value used to initialize bits [63:48] of the result.
4020/// \param __w11
4021/// A 16-bit integral value used to initialize bits [79:64] of the result.
4022/// \param __w10
4023/// A 16-bit integral value used to initialize bits [95:80] of the result.
4024/// \param __w09
4025/// A 16-bit integral value used to initialize bits [111:96] of the result.
4026/// \param __w08
4027/// A 16-bit integral value used to initialize bits [127:112] of the result.
4028/// \param __w07
4029/// A 16-bit integral value used to initialize bits [143:128] of the result.
4030/// \param __w06
4031/// A 16-bit integral value used to initialize bits [159:144] of the result.
4032/// \param __w05
4033/// A 16-bit integral value used to initialize bits [175:160] of the result.
4034/// \param __w04
4035/// A 16-bit integral value used to initialize bits [191:176] of the result.
4036/// \param __w03
4037/// A 16-bit integral value used to initialize bits [207:192] of the result.
4038/// \param __w02
4039/// A 16-bit integral value used to initialize bits [223:208] of the result.
4040/// \param __w01
4041/// A 16-bit integral value used to initialize bits [239:224] of the result.
4042/// \param __w00
4043/// A 16-bit integral value used to initialize bits [255:240] of the result.
4044/// \returns An initialized 256-bit integer vector.
4045static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4046_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4047 short __w11, short __w10, short __w09, short __w08,
4048 short __w07, short __w06, short __w05, short __w04,
4049 short __w03, short __w02, short __w01, short __w00)
4050{
4051 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4052 __w04, __w05, __w06, __w07,
4053 __w08, __w09, __w10, __w11,
4054 __w12, __w13, __w14, __w15);
4055}
4056
4057/// Constructs a 256-bit integer vector, initialized in reverse order
4058/// with the specified 8-bit integral values.
4059///
4060/// \headerfile <x86intrin.h>
4061///
4062/// This intrinsic is a utility function and does not correspond to a specific
4063/// instruction.
4064///
4065/// \param __b31
4066/// An 8-bit integral value used to initialize bits [7:0] of the result.
4067/// \param __b30
4068/// An 8-bit integral value used to initialize bits [15:8] of the result.
4069/// \param __b29
4070/// An 8-bit integral value used to initialize bits [23:16] of the result.
4071/// \param __b28
4072/// An 8-bit integral value used to initialize bits [31:24] of the result.
4073/// \param __b27
4074/// An 8-bit integral value used to initialize bits [39:32] of the result.
4075/// \param __b26
4076/// An 8-bit integral value used to initialize bits [47:40] of the result.
4077/// \param __b25
4078/// An 8-bit integral value used to initialize bits [55:48] of the result.
4079/// \param __b24
4080/// An 8-bit integral value used to initialize bits [63:56] of the result.
4081/// \param __b23
4082/// An 8-bit integral value used to initialize bits [71:64] of the result.
4083/// \param __b22
4084/// An 8-bit integral value used to initialize bits [79:72] of the result.
4085/// \param __b21
4086/// An 8-bit integral value used to initialize bits [87:80] of the result.
4087/// \param __b20
4088/// An 8-bit integral value used to initialize bits [95:88] of the result.
4089/// \param __b19
4090/// An 8-bit integral value used to initialize bits [103:96] of the result.
4091/// \param __b18
4092/// An 8-bit integral value used to initialize bits [111:104] of the result.
4093/// \param __b17
4094/// An 8-bit integral value used to initialize bits [119:112] of the result.
4095/// \param __b16
4096/// An 8-bit integral value used to initialize bits [127:120] of the result.
4097/// \param __b15
4098/// An 8-bit integral value used to initialize bits [135:128] of the result.
4099/// \param __b14
4100/// An 8-bit integral value used to initialize bits [143:136] of the result.
4101/// \param __b13
4102/// An 8-bit integral value used to initialize bits [151:144] of the result.
4103/// \param __b12
4104/// An 8-bit integral value used to initialize bits [159:152] of the result.
4105/// \param __b11
4106/// An 8-bit integral value used to initialize bits [167:160] of the result.
4107/// \param __b10
4108/// An 8-bit integral value used to initialize bits [175:168] of the result.
4109/// \param __b09
4110/// An 8-bit integral value used to initialize bits [183:176] of the result.
4111/// \param __b08
4112/// An 8-bit integral value used to initialize bits [191:184] of the result.
4113/// \param __b07
4114/// An 8-bit integral value used to initialize bits [199:192] of the result.
4115/// \param __b06
4116/// An 8-bit integral value used to initialize bits [207:200] of the result.
4117/// \param __b05
4118/// An 8-bit integral value used to initialize bits [215:208] of the result.
4119/// \param __b04
4120/// An 8-bit integral value used to initialize bits [223:216] of the result.
4121/// \param __b03
4122/// An 8-bit integral value used to initialize bits [231:224] of the result.
4123/// \param __b02
4124/// An 8-bit integral value used to initialize bits [239:232] of the result.
4125/// \param __b01
4126/// An 8-bit integral value used to initialize bits [247:240] of the result.
4127/// \param __b00
4128/// An 8-bit integral value used to initialize bits [255:248] of the result.
4129/// \returns An initialized 256-bit integer vector.
4130static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4131_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4132 char __b27, char __b26, char __b25, char __b24,
4133 char __b23, char __b22, char __b21, char __b20,
4134 char __b19, char __b18, char __b17, char __b16,
4135 char __b15, char __b14, char __b13, char __b12,
4136 char __b11, char __b10, char __b09, char __b08,
4137 char __b07, char __b06, char __b05, char __b04,
4138 char __b03, char __b02, char __b01, char __b00)
4139{
4140 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4141 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4142 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4143 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4144}
4145
4146/// Constructs a 256-bit integer vector, initialized in reverse order
4147/// with the specified 64-bit integral values.
4148///
4149/// \headerfile <x86intrin.h>
4150///
4151/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4152/// instruction.
4153///
4154/// \param __a
4155/// A 64-bit integral value used to initialize bits [63:0] of the result.
4156/// \param __b
4157/// A 64-bit integral value used to initialize bits [127:64] of the result.
4158/// \param __c
4159/// A 64-bit integral value used to initialize bits [191:128] of the result.
4160/// \param __d
4161/// A 64-bit integral value used to initialize bits [255:192] of the result.
4162/// \returns An initialized 256-bit integer vector.
4163static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4164_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4165{
4166 return _mm256_set_epi64x(__d, __c, __b, __a);
4167}
4168
4169/* Create vectors with repeated elements */
4170/// Constructs a 256-bit floating-point vector of [4 x double], with each
4171/// of the four double-precision floating-point vector elements set to the
4172/// specified double-precision floating-point value.
4173///
4174/// \headerfile <x86intrin.h>
4175///
4176/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4177///
4178/// \param __w
4179/// A double-precision floating-point value used to initialize each vector
4180/// element of the result.
4181/// \returns An initialized 256-bit floating-point vector of [4 x double].
4182static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4184{
4185 return _mm256_set_pd(__w, __w, __w, __w);
4186}
4187
4188/// Constructs a 256-bit floating-point vector of [8 x float], with each
4189/// of the eight single-precision floating-point vector elements set to the
4190/// specified single-precision floating-point value.
4191///
4192/// \headerfile <x86intrin.h>
4193///
4194/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4195/// instruction.
4196///
4197/// \param __w
4198/// A single-precision floating-point value used to initialize each vector
4199/// element of the result.
4200/// \returns An initialized 256-bit floating-point vector of [8 x float].
4201static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4203{
4204 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4205}
4206
4207/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4208/// 32-bit integral vector elements set to the specified 32-bit integral
4209/// value.
4210///
4211/// \headerfile <x86intrin.h>
4212///
4213/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4214/// instruction.
4215///
4216/// \param __i
4217/// A 32-bit integral value used to initialize each vector element of the
4218/// result.
4219/// \returns An initialized 256-bit integer vector of [8 x i32].
4220static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4222{
4223 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4224}
4225
4226/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4227/// 16-bit integral vector elements set to the specified 16-bit integral
4228/// value.
4229///
4230/// \headerfile <x86intrin.h>
4231///
4232/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4233///
4234/// \param __w
4235/// A 16-bit integral value used to initialize each vector element of the
4236/// result.
4237/// \returns An initialized 256-bit integer vector of [16 x i16].
4238static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4240{
4241 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4242 __w, __w, __w, __w, __w, __w, __w, __w);
4243}
4244
4245/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4246/// 8-bit integral vector elements set to the specified 8-bit integral value.
4247///
4248/// \headerfile <x86intrin.h>
4249///
4250/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4251///
4252/// \param __b
4253/// An 8-bit integral value used to initialize each vector element of the
4254/// result.
4255/// \returns An initialized 256-bit integer vector of [32 x i8].
4256static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4258{
4259 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4260 __b, __b, __b, __b, __b, __b, __b, __b,
4261 __b, __b, __b, __b, __b, __b, __b, __b,
4262 __b, __b, __b, __b, __b, __b, __b, __b);
4263}
4264
4265/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4266/// 64-bit integral vector elements set to the specified 64-bit integral
4267/// value.
4268///
4269/// \headerfile <x86intrin.h>
4270///
4271/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4272///
4273/// \param __q
4274/// A 64-bit integral value used to initialize each vector element of the
4275/// result.
4276/// \returns An initialized 256-bit integer vector of [4 x i64].
4277static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4279{
4280 return _mm256_set_epi64x(__q, __q, __q, __q);
4281}
4282
4283/* Create zeroed vectors */
4284/// Constructs a 256-bit floating-point vector of [4 x double] with all
4285/// vector elements initialized to zero.
4286///
4287/// \headerfile <x86intrin.h>
4288///
4289/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4290///
4291/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4293 return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
4294}
4295
4296/// Constructs a 256-bit floating-point vector of [8 x float] with all
4297/// vector elements initialized to zero.
4298///
4299/// \headerfile <x86intrin.h>
4300///
4301/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4302///
4303/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4305 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4306}
4307
4308/// Constructs a 256-bit integer vector initialized to zero.
4309///
4310/// \headerfile <x86intrin.h>
4311///
4312/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4313///
4314/// \returns A 256-bit integer vector initialized to zero.
4315static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4317 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4318}
4319
4320/* Cast between vector types */
4321/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4322/// floating-point vector of [8 x float].
4323///
4324/// \headerfile <x86intrin.h>
4325///
4326/// This intrinsic has no corresponding instruction.
4327///
4328/// \param __a
4329/// A 256-bit floating-point vector of [4 x double].
4330/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4331/// bitwise pattern as the parameter.
4332static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4334{
4335 return (__m256)__a;
4336}
4337
4338/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4339/// integer vector.
4340///
4341/// \headerfile <x86intrin.h>
4342///
4343/// This intrinsic has no corresponding instruction.
4344///
4345/// \param __a
4346/// A 256-bit floating-point vector of [4 x double].
4347/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4348/// parameter.
4349static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4351{
4352 return (__m256i)__a;
4353}
4354
4355/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4356/// floating-point vector of [4 x double].
4357///
4358/// \headerfile <x86intrin.h>
4359///
4360/// This intrinsic has no corresponding instruction.
4361///
4362/// \param __a
4363/// A 256-bit floating-point vector of [8 x float].
4364/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4365/// bitwise pattern as the parameter.
4366static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4368{
4369 return (__m256d)__a;
4370}
4371
4372/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4373/// integer vector.
4374///
4375/// \headerfile <x86intrin.h>
4376///
4377/// This intrinsic has no corresponding instruction.
4378///
4379/// \param __a
4380/// A 256-bit floating-point vector of [8 x float].
4381/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4382/// parameter.
4383static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4385{
4386 return (__m256i)__a;
4387}
4388
4389/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4390/// of [8 x float].
4391///
4392/// \headerfile <x86intrin.h>
4393///
4394/// This intrinsic has no corresponding instruction.
4395///
4396/// \param __a
4397/// A 256-bit integer vector.
4398/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4399/// bitwise pattern as the parameter.
4400static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4402{
4403 return (__m256)__a;
4404}
4405
4406/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4407/// of [4 x double].
4408///
4409/// \headerfile <x86intrin.h>
4410///
4411/// This intrinsic has no corresponding instruction.
4412///
4413/// \param __a
4414/// A 256-bit integer vector.
4415/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4416/// bitwise pattern as the parameter.
4417static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4419{
4420 return (__m256d)__a;
4421}
4422
4423/// Returns the lower 128 bits of a 256-bit floating-point vector of
4424/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4425///
4426/// \headerfile <x86intrin.h>
4427///
4428/// This intrinsic has no corresponding instruction.
4429///
4430/// \param __a
4431/// A 256-bit floating-point vector of [4 x double].
4432/// \returns A 128-bit floating-point vector of [2 x double] containing the
4433/// lower 128 bits of the parameter.
4434static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4436{
4437 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4438}
4439
4440/// Returns the lower 128 bits of a 256-bit floating-point vector of
4441/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4442///
4443/// \headerfile <x86intrin.h>
4444///
4445/// This intrinsic has no corresponding instruction.
4446///
4447/// \param __a
4448/// A 256-bit floating-point vector of [8 x float].
4449/// \returns A 128-bit floating-point vector of [4 x float] containing the
4450/// lower 128 bits of the parameter.
4451static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4453{
4454 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4455}
4456
4457/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4458///
4459/// \headerfile <x86intrin.h>
4460///
4461/// This intrinsic has no corresponding instruction.
4462///
4463/// \param __a
4464/// A 256-bit integer vector.
4465/// \returns A 128-bit integer vector containing the lower 128 bits of the
4466/// parameter.
4467static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4469{
4470 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4471}
4472
4473/// Constructs a 256-bit floating-point vector of [4 x double] from a
4474/// 128-bit floating-point vector of [2 x double].
4475///
4476/// The lower 128 bits contain the value of the source vector. The contents
4477/// of the upper 128 bits are undefined.
4478///
4479/// \headerfile <x86intrin.h>
4480///
4481/// This intrinsic has no corresponding instruction.
4482///
4483/// \param __a
4484/// A 128-bit vector of [2 x double].
4485/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4486/// contain the value of the parameter. The contents of the upper 128 bits
4487/// are undefined.
4488static __inline __m256d __DEFAULT_FN_ATTRS
4490{
4491 return __builtin_shufflevector(
4492 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4493}
4494
4495/// Constructs a 256-bit floating-point vector of [8 x float] from a
4496/// 128-bit floating-point vector of [4 x float].
4497///
4498/// The lower 128 bits contain the value of the source vector. The contents
4499/// of the upper 128 bits are undefined.
4500///
4501/// \headerfile <x86intrin.h>
4502///
4503/// This intrinsic has no corresponding instruction.
4504///
4505/// \param __a
4506/// A 128-bit vector of [4 x float].
4507/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4508/// contain the value of the parameter. The contents of the upper 128 bits
4509/// are undefined.
4510static __inline __m256 __DEFAULT_FN_ATTRS
4512{
4513 return __builtin_shufflevector((__v4sf)__a,
4514 (__v4sf)__builtin_nondeterministic_value(__a),
4515 0, 1, 2, 3, 4, 5, 6, 7);
4516}
4517
4518/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4519///
4520/// The lower 128 bits contain the value of the source vector. The contents
4521/// of the upper 128 bits are undefined.
4522///
4523/// \headerfile <x86intrin.h>
4524///
4525/// This intrinsic has no corresponding instruction.
4526///
4527/// \param __a
4528/// A 128-bit integer vector.
4529/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4530/// the parameter. The contents of the upper 128 bits are undefined.
4531static __inline __m256i __DEFAULT_FN_ATTRS
4533{
4534 return __builtin_shufflevector(
4535 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4536}
4537
4538/// Constructs a 256-bit floating-point vector of [4 x double] from a
4539/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4540/// contain the value of the source vector. The upper 128 bits are set
4541/// to zero.
4542///
4543/// \headerfile <x86intrin.h>
4544///
4545/// This intrinsic has no corresponding instruction.
4546///
4547/// \param __a
4548/// A 128-bit vector of [2 x double].
4549/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4550/// contain the value of the parameter. The upper 128 bits are set to zero.
4551static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4553 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4554}
4555
4556/// Constructs a 256-bit floating-point vector of [8 x float] from a
4557/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4558/// the value of the source vector. The upper 128 bits are set to zero.
4559///
4560/// \headerfile <x86intrin.h>
4561///
4562/// This intrinsic has no corresponding instruction.
4563///
4564/// \param __a
4565/// A 128-bit vector of [4 x float].
4566/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4567/// contain the value of the parameter. The upper 128 bits are set to zero.
4568static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4570 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4571}
4572
4573/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4574/// The lower 128 bits contain the value of the source vector. The upper
4575/// 128 bits are set to zero.
4576///
4577/// \headerfile <x86intrin.h>
4578///
4579/// This intrinsic has no corresponding instruction.
4580///
4581/// \param __a
4582/// A 128-bit integer vector.
4583/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4584/// the parameter. The upper 128 bits are set to zero.
4585static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4587 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4588}
4589
4590/*
4591 Vector insert.
4592 We use macros rather than inlines because we only want to accept
4593 invocations where the immediate M is a constant expression.
4594*/
4595/// Constructs a new 256-bit vector of [8 x float] by first duplicating
4596/// a 256-bit vector of [8 x float] given in the first parameter, and then
4597/// replacing either the upper or the lower 128 bits with the contents of a
4598/// 128-bit vector of [4 x float] in the second parameter.
4599///
4600/// The immediate integer parameter determines between the upper or the lower
4601/// 128 bits.
4602///
4603/// \headerfile <x86intrin.h>
4604///
4605/// \code
4606/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4607/// \endcode
4608///
4609/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4610///
4611/// \param V1
4612/// A 256-bit vector of [8 x float]. This vector is copied to the result
4613/// first, and then either the upper or the lower 128 bits of the result will
4614/// be replaced by the contents of \a V2.
4615/// \param V2
4616/// A 128-bit vector of [4 x float]. The contents of this parameter are
4617/// written to either the upper or the lower 128 bits of the result depending
4618/// on the value of parameter \a M.
4619/// \param M
4620/// An immediate integer. The least significant bit determines how the values
4621/// from the two parameters are interleaved: \n
4622/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
4623/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4624/// result. \n
4625/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
4626/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4627/// result.
4628/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M)                                        \
  ((__m256)__builtin_ia32_vinsertf128_ps256(                                   \
      (__v8sf)(__m256)(V1), (__v4sf)(__m128)(V2), (int)(M)))
4632
4633/// Constructs a new 256-bit vector of [4 x double] by first duplicating
4634/// a 256-bit vector of [4 x double] given in the first parameter, and then
4635/// replacing either the upper or the lower 128 bits with the contents of a
4636/// 128-bit vector of [2 x double] in the second parameter.
4637///
4638/// The immediate integer parameter determines between the upper or the lower
4639/// 128 bits.
4640///
4641/// \headerfile <x86intrin.h>
4642///
4643/// \code
4644/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
4645/// \endcode
4646///
4647/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4648///
4649/// \param V1
4650/// A 256-bit vector of [4 x double]. This vector is copied to the result
4651/// first, and then either the upper or the lower 128 bits of the result will
4652/// be replaced by the contents of \a V2.
4653/// \param V2
4654/// A 128-bit vector of [2 x double]. The contents of this parameter are
4655/// written to either the upper or the lower 128 bits of the result depending
4656/// on the value of parameter \a M.
4657/// \param M
4658/// An immediate integer. The least significant bit determines how the values
4659/// from the two parameters are interleaved: \n
4660/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
4661/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4662/// result. \n
4663/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
4664/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4665/// result.
4666/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M)                                        \
  ((__m256d)__builtin_ia32_vinsertf128_pd256(                                  \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))
4670
/// Builds a 256-bit integer vector from a copy of \a V1 in which one 128-bit
///    half has been overwritten with the 128-bit integer vector \a V2.
///
///    The immediate integer parameter selects whether the lower or the upper
///    128 bits are overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the 128-bit half of the result
///    that is not replaced by \a V2.
/// \param V2
///    A 128-bit integer vector. It is written to the lower or the upper
///    128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. The least significant bit selects the half that
///    is replaced: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256( \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
4708
4709/*
4710 Vector extract.
4711 We use macros rather than inlines because we only want to accept
4712 invocations where the immediate M is a constant expression.
4713*/
/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a 128-bit
///    vector of [4 x float]. The immediate integer parameter selects the
///    lower or the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float] to extract from.
/// \param M
///    An immediate integer. The least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256( \
      (__v8sf)(__m256)(V), (int)(M)))
4737
/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a 128-bit
///    vector of [2 x double]. The immediate integer parameter selects the
///    lower or the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] to extract from.
/// \param M
///    An immediate integer. The least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256( \
      (__v4df)(__m256d)(V), (int)(M)))
4761
/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit integer
///    vector. The immediate integer parameter selects the lower or the upper
///    half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector to extract from.
/// \param M
///    An immediate integer. The least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256( \
      (__v8si)(__m256i)(V), (int)(M)))
4785
4786/// Constructs a 256-bit floating-point vector of [8 x float] by
4787/// concatenating two 128-bit floating-point vectors of [4 x float].
4788///
4789/// \headerfile <x86intrin.h>
4790///
4791/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4792///
4793/// \param __hi
4794/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4795/// 128 bits of the result.
4796/// \param __lo
4797/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4798/// 128 bits of the result.
4799/// \returns A 256-bit floating-point vector of [8 x float] containing the
4800/// concatenated result.
4801static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4802_mm256_set_m128(__m128 __hi, __m128 __lo) {
4803 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4804}
4805
4806/// Constructs a 256-bit floating-point vector of [4 x double] by
4807/// concatenating two 128-bit floating-point vectors of [2 x double].
4808///
4809/// \headerfile <x86intrin.h>
4810///
4811/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4812///
4813/// \param __hi
4814/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4815/// 128 bits of the result.
4816/// \param __lo
4817/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4818/// 128 bits of the result.
4819/// \returns A 256-bit floating-point vector of [4 x double] containing the
4820/// concatenated result.
4821static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4822_mm256_set_m128d(__m128d __hi, __m128d __lo) {
4823 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4824}
4825
4826/// Constructs a 256-bit integer vector by concatenating two 128-bit
4827/// integer vectors.
4828///
4829/// \headerfile <x86intrin.h>
4830///
4831/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4832///
4833/// \param __hi
4834/// A 128-bit integer vector to be copied to the upper 128 bits of the
4835/// result.
4836/// \param __lo
4837/// A 128-bit integer vector to be copied to the lower 128 bits of the
4838/// result.
4839/// \returns A 256-bit integer vector containing the concatenated result.
4840static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4841_mm256_set_m128i(__m128i __hi, __m128i __lo) {
4842 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4843}
4844
4845/// Constructs a 256-bit floating-point vector of [8 x float] by
4846/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4847/// similar to _mm256_set_m128, but the order of the input parameters is
4848/// swapped.
4849///
4850/// \headerfile <x86intrin.h>
4851///
4852/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4853///
4854/// \param __lo
4855/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4856/// 128 bits of the result.
4857/// \param __hi
4858/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4859/// 128 bits of the result.
4860/// \returns A 256-bit floating-point vector of [8 x float] containing the
4861/// concatenated result.
4862static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4863_mm256_setr_m128(__m128 __lo, __m128 __hi) {
4864 return _mm256_set_m128(__hi, __lo);
4865}
4866
4867/// Constructs a 256-bit floating-point vector of [4 x double] by
4868/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4869/// similar to _mm256_set_m128d, but the order of the input parameters is
4870/// swapped.
4871///
4872/// \headerfile <x86intrin.h>
4873///
4874/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4875///
4876/// \param __lo
4877/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4878/// 128 bits of the result.
4879/// \param __hi
4880/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4881/// 128 bits of the result.
4882/// \returns A 256-bit floating-point vector of [4 x double] containing the
4883/// concatenated result.
4884static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4885_mm256_setr_m128d(__m128d __lo, __m128d __hi) {
4886 return (__m256d)_mm256_set_m128d(__hi, __lo);
4887}
4888
4889/// Constructs a 256-bit integer vector by concatenating two 128-bit
4890/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4891/// the input parameters is swapped.
4892///
4893/// \headerfile <x86intrin.h>
4894///
4895/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4896///
4897/// \param __lo
4898/// A 128-bit integer vector to be copied to the lower 128 bits of the
4899/// result.
4900/// \param __hi
4901/// A 128-bit integer vector to be copied to the upper 128 bits of the
4902/// result.
4903/// \returns A 256-bit integer vector containing the concatenated result.
4904static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4905_mm256_setr_m128i(__m128i __lo, __m128i __hi) {
4906 return (__m256i)_mm256_set_m128i(__hi, __lo);
4907}
4908
4909/* SIMD load ops (unaligned) */
4910/// Loads two 128-bit floating-point vectors of [4 x float] from
4911/// unaligned memory locations and constructs a 256-bit floating-point vector
4912/// of [8 x float] by concatenating the two 128-bit vectors.
4913///
4914/// \headerfile <x86intrin.h>
4915///
4916/// This intrinsic corresponds to load instructions followed by the
4917/// <c> VINSERTF128 </c> instruction.
4918///
4919/// \param __addr_hi
4920/// A pointer to a 128-bit memory location containing 4 consecutive
4921/// single-precision floating-point values. These values are to be copied to
4922/// bits[255:128] of the result. The address of the memory location does not
4923/// have to be aligned.
4924/// \param __addr_lo
4925/// A pointer to a 128-bit memory location containing 4 consecutive
4926/// single-precision floating-point values. These values are to be copied to
4927/// bits[127:0] of the result. The address of the memory location does not
4928/// have to be aligned.
4929/// \returns A 256-bit floating-point vector of [8 x float] containing the
4930/// concatenated result.
4931static __inline __m256 __DEFAULT_FN_ATTRS
4932_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4933{
4934 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4935}
4936
4937/// Loads two 128-bit floating-point vectors of [2 x double] from
4938/// unaligned memory locations and constructs a 256-bit floating-point vector
4939/// of [4 x double] by concatenating the two 128-bit vectors.
4940///
4941/// \headerfile <x86intrin.h>
4942///
4943/// This intrinsic corresponds to load instructions followed by the
4944/// <c> VINSERTF128 </c> instruction.
4945///
4946/// \param __addr_hi
4947/// A pointer to a 128-bit memory location containing two consecutive
4948/// double-precision floating-point values. These values are to be copied to
4949/// bits[255:128] of the result. The address of the memory location does not
4950/// have to be aligned.
4951/// \param __addr_lo
4952/// A pointer to a 128-bit memory location containing two consecutive
4953/// double-precision floating-point values. These values are to be copied to
4954/// bits[127:0] of the result. The address of the memory location does not
4955/// have to be aligned.
4956/// \returns A 256-bit floating-point vector of [4 x double] containing the
4957/// concatenated result.
4958static __inline __m256d __DEFAULT_FN_ATTRS
4959_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4960{
4961 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4962}
4963
4964/// Loads two 128-bit integer vectors from unaligned memory locations and
4965/// constructs a 256-bit integer vector by concatenating the two 128-bit
4966/// vectors.
4967///
4968/// \headerfile <x86intrin.h>
4969///
4970/// This intrinsic corresponds to load instructions followed by the
4971/// <c> VINSERTF128 </c> instruction.
4972///
4973/// \param __addr_hi
4974/// A pointer to a 128-bit memory location containing a 128-bit integer
4975/// vector. This vector is to be copied to bits[255:128] of the result. The
4976/// address of the memory location does not have to be aligned.
4977/// \param __addr_lo
4978/// A pointer to a 128-bit memory location containing a 128-bit integer
4979/// vector. This vector is to be copied to bits[127:0] of the result. The
4980/// address of the memory location does not have to be aligned.
4981/// \returns A 256-bit integer vector containing the concatenated result.
4982static __inline __m256i __DEFAULT_FN_ATTRS
4983_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
4984{
4985 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
4986}
4987
4988/* SIMD store ops (unaligned) */
4989/// Stores the upper and lower 128 bits of a 256-bit floating-point
4990/// vector of [8 x float] into two different unaligned memory locations.
4991///
4992/// \headerfile <x86intrin.h>
4993///
4994/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4995/// store instructions.
4996///
4997/// \param __addr_hi
4998/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4999/// copied to this memory location. The address of this memory location does
5000/// not have to be aligned.
5001/// \param __addr_lo
5002/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5003/// copied to this memory location. The address of this memory location does
5004/// not have to be aligned.
5005/// \param __a
5006/// A 256-bit floating-point vector of [8 x float].
5007static __inline void __DEFAULT_FN_ATTRS
5008_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5009{
5010 __m128 __v128;
5011
5012 __v128 = _mm256_castps256_ps128(__a);
5013 _mm_storeu_ps(__addr_lo, __v128);
5014 __v128 = _mm256_extractf128_ps(__a, 1);
5015 _mm_storeu_ps(__addr_hi, __v128);
5016}
5017
5018/// Stores the upper and lower 128 bits of a 256-bit floating-point
5019/// vector of [4 x double] into two different unaligned memory locations.
5020///
5021/// \headerfile <x86intrin.h>
5022///
5023/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5024/// store instructions.
5025///
5026/// \param __addr_hi
5027/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5028/// copied to this memory location. The address of this memory location does
5029/// not have to be aligned.
5030/// \param __addr_lo
5031/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5032/// copied to this memory location. The address of this memory location does
5033/// not have to be aligned.
5034/// \param __a
5035/// A 256-bit floating-point vector of [4 x double].
5036static __inline void __DEFAULT_FN_ATTRS
5037_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5038{
5039 __m128d __v128;
5040
5041 __v128 = _mm256_castpd256_pd128(__a);
5042 _mm_storeu_pd(__addr_lo, __v128);
5043 __v128 = _mm256_extractf128_pd(__a, 1);
5044 _mm_storeu_pd(__addr_hi, __v128);
5045}
5046
5047/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5048/// two different unaligned memory locations.
5049///
5050/// \headerfile <x86intrin.h>
5051///
5052/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5053/// store instructions.
5054///
5055/// \param __addr_hi
5056/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5057/// copied to this memory location. The address of this memory location does
5058/// not have to be aligned.
5059/// \param __addr_lo
5060/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5061/// copied to this memory location. The address of this memory location does
5062/// not have to be aligned.
5063/// \param __a
5064/// A 256-bit integer vector.
5065static __inline void __DEFAULT_FN_ATTRS
5066_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5067{
5068 __m128i __v128;
5069
5070 __v128 = _mm256_castsi256_si128(__a);
5071 _mm_storeu_si128(__addr_lo, __v128);
5072 __v128 = _mm256_extractf128_si256(__a, 1);
5073 _mm_storeu_si128(__addr_hi, __v128);
5074}
5075
5076#undef __DEFAULT_FN_ATTRS
5077#undef __DEFAULT_FN_ATTRS_CONSTEXPR
5078#undef __DEFAULT_FN_ATTRS128
5079#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5080
5081#endif /* __AVXINTRIN_H */
__device__ _Float16
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3025
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition avxintrin.h:3069
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition avxintrin.h:3275
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition avxintrin.h:3571
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition avxintrin.h:2945
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition avxintrin.h:3089
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4552
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition avxintrin.h:2275
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition avxintrin.h:3239
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition avxintrin.h:3295
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition avxintrin.h:2962
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition avxintrin.h:4932
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:356
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3385
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition avxintrin.h:581
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2596
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition avxintrin.h:3731
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4569
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2680
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition avxintrin.h:116
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:390
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition avxintrin.h:2200
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition avxintrin.h:3620
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition avxintrin.h:306
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition avxintrin.h:3966
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4863
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3410
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3361
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition avxintrin.h:978
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition avxintrin.h:4350
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition avxintrin.h:188
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4905
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition avxintrin.h:3333
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition avxintrin.h:4735
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition avxintrin.h:4783
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition avxintrin.h:3182
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition avxintrin.h:4401
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition avxintrin.h:4333
static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition avxintrin.h:2342
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition avxintrin.h:3660
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition avxintrin.h:2367
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4885
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition avxintrin.h:132
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition avxintrin.h:1402
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition avxintrin.h:3551
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition avxintrin.h:887
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition avxintrin.h:3607
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:763
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:373
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition avxintrin.h:3779
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition avxintrin.h:2185
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition avxintrin.h:602
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition avxintrin.h:3483
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4489
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition avxintrin.h:4183
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2510
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition avxintrin.h:2255
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition avxintrin.h:3633
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition avxintrin.h:2235
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition avxintrin.h:2219
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition avxintrin.h:4202
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2484
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition avxintrin.h:288
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition avxintrin.h:3125
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition avxintrin.h:2171
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2876
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2623
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition avxintrin.h:2326
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition avxintrin.h:4759
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers re...
Definition avxintrin.h:2295
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4511
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3047
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition avxintrin.h:246
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2708
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zer...
Definition avxintrin.h:4304
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements se...
Definition avxintrin.h:4221
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition avxintrin.h:3862
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2651
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition avxintrin.h:4959
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition avxintrin.h:82
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2927
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:674
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two diffe...
Definition avxintrin.h:5037
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition avxintrin.h:339
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x doub...
Definition avxintrin.h:151
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2766
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x floa...
Definition avxintrin.h:170
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2793
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:698
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition avxintrin.h:4164
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements se...
Definition avxintrin.h:4278
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition avxintrin.h:3507
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition avxintrin.h:3459
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition avxintrin.h:4131
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4418
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p in...
Definition avxintrin.h:3142
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-preci...
Definition avxintrin.h:3699
static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-p...
Definition avxintrin.h:4435
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4586
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition avxintrin.h:98
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition avxintrin.h:4046
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition avxintrin.h:2392
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition avxintrin.h:2414
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory l...
Definition avxintrin.h:5066
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
Definition avxintrin.h:4452
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:620
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition avxintrin.h:4384
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2821
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:719
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:656
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:638
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:742
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition avxintrin.h:1429
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition avxintrin.h:4983
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition avxintrin.h:793
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2538
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2850
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to ze...
Definition avxintrin.h:4292
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition avxintrin.h:3897
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3434
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition avxintrin.h:322
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition avxintrin.h:3592
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition avxintrin.h:560
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit int...
Definition avxintrin.h:3198
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to b...
Definition avxintrin.h:3316
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition avxintrin.h:267
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4802
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x dou...
Definition avxintrin.h:4367
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4316
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4532
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition avxintrin.h:204
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p in...
Definition avxintrin.h:3162
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3003
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition avxintrin.h:3998
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2566
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition avxintrin.h:2458
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition avxintrin.h:2437
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements s...
Definition avxintrin.h:4239
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition avxintrin.h:832
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set...
Definition avxintrin.h:4257
static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition avxintrin.h:4468
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition avxintrin.h:225
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2736
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit int...
Definition avxintrin.h:3219
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2901
static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition avxintrin.h:2311
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4841
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte align...
Definition avxintrin.h:3257
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the spe...
Definition avxintrin.h:3926
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two differ...
Definition avxintrin.h:5008
typedef double __v4df __attribute__((__vector_size__(32)))
Definition avxintrin.h:17
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition avxintrin.h:3531
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4822
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition avxintrin.h:3109
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition avxintrin.h:542
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition emmintrin.h:1619
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878
static __inline__ void _mm_stream_si32(void *__p, int __a)
Definition emmintrin.h:4077
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1867
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition emmintrin.h:3456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition emmintrin.h:1980
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition emmintrin.h:3909
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2100
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2021
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1863