clang 22.0.0git
avxintrin.h
Go to the documentation of this file.
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
17typedef double __v4df __attribute__ ((__vector_size__ (32)));
18typedef float __v8sf __attribute__ ((__vector_size__ (32)));
19typedef long long __v4di __attribute__ ((__vector_size__ (32)));
20typedef int __v8si __attribute__ ((__vector_size__ (32)));
21typedef short __v16hi __attribute__ ((__vector_size__ (32)));
22typedef char __v32qi __attribute__ ((__vector_size__ (32)));
23
24/* Unsigned types */
25typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
26typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
27typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
28typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
29
30/* We need an explicitly signed variant for char. Note that this shouldn't
31 * appear in the interface though. */
32typedef signed char __v32qs __attribute__((__vector_size__(32)));
33
34typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
35typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
36typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
37
38typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
39typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
40typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
41
42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Default attributes applied to the functions in this file: one set with a
 * minimum vector width of 256 and one with a minimum vector width of 128. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),           \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),           \
                 __min_vector_width__(128)))

/* The _CONSTEXPR variants append `constexpr` only when compiling as C++11 or
 * later; otherwise they are plain aliases of the macros above. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif
67
68/* Arithmetic */
69/// Adds two 256-bit vectors of [4 x double].
70///
71/// \headerfile <x86intrin.h>
72///
73/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
74///
75/// \param __a
76/// A 256-bit vector of [4 x double] containing one of the source operands.
77/// \param __b
78/// A 256-bit vector of [4 x double] containing one of the source operands.
79/// \returns A 256-bit vector of [4 x double] containing the sums of both
80/// operands.
81static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
82_mm256_add_pd(__m256d __a, __m256d __b) {
83 return (__m256d)((__v4df)__a+(__v4df)__b);
84}
85
86/// Adds two 256-bit vectors of [8 x float].
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
91///
92/// \param __a
93/// A 256-bit vector of [8 x float] containing one of the source operands.
94/// \param __b
95/// A 256-bit vector of [8 x float] containing one of the source operands.
96/// \returns A 256-bit vector of [8 x float] containing the sums of both
97/// operands.
98static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a,
99 __m256 __b) {
100 return (__m256)((__v8sf)__a+(__v8sf)__b);
101}
102
103/// Subtracts two 256-bit vectors of [4 x double].
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
108///
109/// \param __a
110/// A 256-bit vector of [4 x double] containing the minuend.
111/// \param __b
112/// A 256-bit vector of [4 x double] containing the subtrahend.
113/// \returns A 256-bit vector of [4 x double] containing the differences between
114/// both operands.
115static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
116_mm256_sub_pd(__m256d __a, __m256d __b) {
117 return (__m256d)((__v4df)__a-(__v4df)__b);
118}
119
120/// Subtracts two 256-bit vectors of [8 x float].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
125///
126/// \param __a
127/// A 256-bit vector of [8 x float] containing the minuend.
128/// \param __b
129/// A 256-bit vector of [8 x float] containing the subtrahend.
130/// \returns A 256-bit vector of [8 x float] containing the differences between
131/// both operands.
132static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a,
133 __m256 __b) {
134 return (__m256)((__v8sf)__a-(__v8sf)__b);
135}
136
137/// Adds the even-indexed values and subtracts the odd-indexed values of
138/// two 256-bit vectors of [4 x double].
139///
140/// \headerfile <x86intrin.h>
141///
142/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
143///
144/// \param __a
145/// A 256-bit vector of [4 x double] containing the left source operand.
146/// \param __b
147/// A 256-bit vector of [4 x double] containing the right source operand.
148/// \returns A 256-bit vector of [4 x double] containing the alternating sums
149/// and differences between both operands.
150static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
151_mm256_addsub_pd(__m256d __a, __m256d __b) {
152 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
153}
154
155/// Adds the even-indexed values and subtracts the odd-indexed values of
156/// two 256-bit vectors of [8 x float].
157///
158/// \headerfile <x86intrin.h>
159///
160/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
161///
162/// \param __a
163/// A 256-bit vector of [8 x float] containing the left source operand.
164/// \param __b
165/// A 256-bit vector of [8 x float] containing the right source operand.
166/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
167/// differences between both operands.
168static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
169_mm256_addsub_ps(__m256 __a, __m256 __b) {
170 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
171}
172
173/// Divides two 256-bit vectors of [4 x double].
174///
175/// \headerfile <x86intrin.h>
176///
177/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
178///
179/// \param __a
180/// A 256-bit vector of [4 x double] containing the dividend.
181/// \param __b
182/// A 256-bit vector of [4 x double] containing the divisor.
183/// \returns A 256-bit vector of [4 x double] containing the quotients of both
184/// operands.
185static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
186_mm256_div_pd(__m256d __a, __m256d __b) {
187 return (__m256d)((__v4df)__a/(__v4df)__b);
188}
189
190/// Divides two 256-bit vectors of [8 x float].
191///
192/// \headerfile <x86intrin.h>
193///
194/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
195///
196/// \param __a
197/// A 256-bit vector of [8 x float] containing the dividend.
198/// \param __b
199/// A 256-bit vector of [8 x float] containing the divisor.
200/// \returns A 256-bit vector of [8 x float] containing the quotients of both
201/// operands.
202static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a,
203 __m256 __b) {
204 return (__m256)((__v8sf)__a/(__v8sf)__b);
205}
206
207/// Compares two 256-bit vectors of [4 x double] and returns the greater
208/// of each pair of values.
209///
210/// If either value in a comparison is NaN, returns the value from \a __b.
211///
212/// \headerfile <x86intrin.h>
213///
214/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
215///
216/// \param __a
217/// A 256-bit vector of [4 x double] containing one of the operands.
218/// \param __b
219/// A 256-bit vector of [4 x double] containing one of the operands.
220/// \returns A 256-bit vector of [4 x double] containing the maximum values
221/// between both operands.
222static __inline __m256d __DEFAULT_FN_ATTRS
223_mm256_max_pd(__m256d __a, __m256d __b)
224{
225 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
226}
227
228/// Compares two 256-bit vectors of [8 x float] and returns the greater
229/// of each pair of values.
230///
231/// If either value in a comparison is NaN, returns the value from \a __b.
232///
233/// \headerfile <x86intrin.h>
234///
235/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
236///
237/// \param __a
238/// A 256-bit vector of [8 x float] containing one of the operands.
239/// \param __b
240/// A 256-bit vector of [8 x float] containing one of the operands.
241/// \returns A 256-bit vector of [8 x float] containing the maximum values
242/// between both operands.
243static __inline __m256 __DEFAULT_FN_ATTRS
244_mm256_max_ps(__m256 __a, __m256 __b)
245{
246 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
247}
248
249/// Compares two 256-bit vectors of [4 x double] and returns the lesser
250/// of each pair of values.
251///
252/// If either value in a comparison is NaN, returns the value from \a __b.
253///
254/// \headerfile <x86intrin.h>
255///
256/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
257///
258/// \param __a
259/// A 256-bit vector of [4 x double] containing one of the operands.
260/// \param __b
261/// A 256-bit vector of [4 x double] containing one of the operands.
262/// \returns A 256-bit vector of [4 x double] containing the minimum values
263/// between both operands.
264static __inline __m256d __DEFAULT_FN_ATTRS
265_mm256_min_pd(__m256d __a, __m256d __b)
266{
267 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
268}
269
270/// Compares two 256-bit vectors of [8 x float] and returns the lesser
271/// of each pair of values.
272///
273/// If either value in a comparison is NaN, returns the value from \a __b.
274///
275/// \headerfile <x86intrin.h>
276///
277/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
278///
279/// \param __a
280/// A 256-bit vector of [8 x float] containing one of the operands.
281/// \param __b
282/// A 256-bit vector of [8 x float] containing one of the operands.
283/// \returns A 256-bit vector of [8 x float] containing the minimum values
284/// between both operands.
285static __inline __m256 __DEFAULT_FN_ATTRS
286_mm256_min_ps(__m256 __a, __m256 __b)
287{
288 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
289}
290
291/// Multiplies two 256-bit vectors of [4 x double].
292///
293/// \headerfile <x86intrin.h>
294///
295/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
296///
297/// \param __a
298/// A 256-bit vector of [4 x double] containing one of the operands.
299/// \param __b
300/// A 256-bit vector of [4 x double] containing one of the operands.
301/// \returns A 256-bit vector of [4 x double] containing the products of both
302/// operands.
303static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
304_mm256_mul_pd(__m256d __a, __m256d __b) {
305 return (__m256d)((__v4df)__a * (__v4df)__b);
306}
307
308/// Multiplies two 256-bit vectors of [8 x float].
309///
310/// \headerfile <x86intrin.h>
311///
312/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
313///
314/// \param __a
315/// A 256-bit vector of [8 x float] containing one of the operands.
316/// \param __b
317/// A 256-bit vector of [8 x float] containing one of the operands.
318/// \returns A 256-bit vector of [8 x float] containing the products of both
319/// operands.
320static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
321 __m256 __b) {
322 return (__m256)((__v8sf)__a * (__v8sf)__b);
323}
324
325/// Calculates the square roots of the values in a 256-bit vector of
326/// [4 x double].
327///
328/// \headerfile <x86intrin.h>
329///
330/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
331///
332/// \param __a
333/// A 256-bit vector of [4 x double].
334/// \returns A 256-bit vector of [4 x double] containing the square roots of the
335/// values in the operand.
336static __inline __m256d __DEFAULT_FN_ATTRS
338{
339 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
340}
341
342/// Calculates the square roots of the values in a 256-bit vector of
343/// [8 x float].
344///
345/// \headerfile <x86intrin.h>
346///
347/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
348///
349/// \param __a
350/// A 256-bit vector of [8 x float].
351/// \returns A 256-bit vector of [8 x float] containing the square roots of the
352/// values in the operand.
353static __inline __m256 __DEFAULT_FN_ATTRS
355{
356 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
357}
358
359/// Calculates the reciprocal square roots of the values in a 256-bit
360/// vector of [8 x float].
361///
362/// \headerfile <x86intrin.h>
363///
364/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
365///
366/// \param __a
367/// A 256-bit vector of [8 x float].
368/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
369/// roots of the values in the operand.
370static __inline __m256 __DEFAULT_FN_ATTRS
372{
373 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
374}
375
376/// Calculates the reciprocals of the values in a 256-bit vector of
377/// [8 x float].
378///
379/// \headerfile <x86intrin.h>
380///
381/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
382///
383/// \param __a
384/// A 256-bit vector of [8 x float].
385/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
386/// values in the operand.
387static __inline __m256 __DEFAULT_FN_ATTRS
389{
390 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
391}
392
/// Rounds each value of a 256-bit vector of [4 x double] using the rounding
///    operation selected by the byte operand. The source values are rounded
///    to integer values and returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M)                                                  \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
424
/// Rounds each value of a 256-bit vector of [8 x float] using the rounding
///    operation selected by the byte operand. The source values are rounded
///    to integer values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M)                                                  \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
456
/// Rounds up each value of a 256-bit vector of [4 x double]. The source
///    values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
///    Expands to _mm256_round_pd with the _MM_FROUND_CEIL rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)                                                      \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
473
/// Rounds down each value of a 256-bit vector of [4 x double]. The source
///    values are rounded down to integer values and returned as 64-bit
///    double-precision floating-point values.
///
///    Expands to _mm256_round_pd with the _MM_FROUND_FLOOR rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V)                                                     \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
491
/// Rounds up each value of a 256-bit vector of [8 x float]. The source
///    values are rounded up to integer values and returned as floating-point
///    values.
///
///    Expands to _mm256_round_ps with the _MM_FROUND_CEIL rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)                                                      \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
508
/// Rounds down each value of a 256-bit vector of [8 x float]. The source
///    values are rounded down to integer values and returned as
///    floating-point values.
///
///    Expands to _mm256_round_ps with the _MM_FROUND_FLOOR rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V)                                                     \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
525
526/* Logical */
527/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
528///
529/// \headerfile <x86intrin.h>
530///
531/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
532///
533/// \param __a
534/// A 256-bit vector of [4 x double] containing one of the source operands.
535/// \param __b
536/// A 256-bit vector of [4 x double] containing one of the source operands.
537/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
538/// values between both operands.
539static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
540_mm256_and_pd(__m256d __a, __m256d __b)
541{
542 return (__m256d)((__v4du)__a & (__v4du)__b);
543}
544
545/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
546///
547/// \headerfile <x86intrin.h>
548///
549/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
550///
551/// \param __a
552/// A 256-bit vector of [8 x float] containing one of the source operands.
553/// \param __b
554/// A 256-bit vector of [8 x float] containing one of the source operands.
555/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
556/// values between both operands.
557static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
558_mm256_and_ps(__m256 __a, __m256 __b)
559{
560 return (__m256)((__v8su)__a & (__v8su)__b);
561}
562
563/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
564/// the one's complement of the values contained in the first source operand.
565///
566/// \headerfile <x86intrin.h>
567///
568/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
569///
570/// \param __a
571/// A 256-bit vector of [4 x double] containing the left source operand. The
572/// one's complement of this value is used in the bitwise AND.
573/// \param __b
574/// A 256-bit vector of [4 x double] containing the right source operand.
575/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
576/// values of the second operand and the one's complement of the first
577/// operand.
578static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
579_mm256_andnot_pd(__m256d __a, __m256d __b)
580{
581 return (__m256d)(~(__v4du)__a & (__v4du)__b);
582}
583
584/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
585/// the one's complement of the values contained in the first source operand.
586///
587/// \headerfile <x86intrin.h>
588///
589/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
590///
591/// \param __a
592/// A 256-bit vector of [8 x float] containing the left source operand. The
593/// one's complement of this value is used in the bitwise AND.
594/// \param __b
595/// A 256-bit vector of [8 x float] containing the right source operand.
596/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
597/// values of the second operand and the one's complement of the first
598/// operand.
599static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
600_mm256_andnot_ps(__m256 __a, __m256 __b)
601{
602 return (__m256)(~(__v8su)__a & (__v8su)__b);
603}
604
605/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
606///
607/// \headerfile <x86intrin.h>
608///
609/// This intrinsic corresponds to the <c> VORPD </c> instruction.
610///
611/// \param __a
612/// A 256-bit vector of [4 x double] containing one of the source operands.
613/// \param __b
614/// A 256-bit vector of [4 x double] containing one of the source operands.
615/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
616/// values between both operands.
617static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
618_mm256_or_pd(__m256d __a, __m256d __b)
619{
620 return (__m256d)((__v4du)__a | (__v4du)__b);
621}
622
623/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
624///
625/// \headerfile <x86intrin.h>
626///
627/// This intrinsic corresponds to the <c> VORPS </c> instruction.
628///
629/// \param __a
630/// A 256-bit vector of [8 x float] containing one of the source operands.
631/// \param __b
632/// A 256-bit vector of [8 x float] containing one of the source operands.
633/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
634/// values between both operands.
635static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
636_mm256_or_ps(__m256 __a, __m256 __b)
637{
638 return (__m256)((__v8su)__a | (__v8su)__b);
639}
640
641/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
642///
643/// \headerfile <x86intrin.h>
644///
645/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
646///
647/// \param __a
648/// A 256-bit vector of [4 x double] containing one of the source operands.
649/// \param __b
650/// A 256-bit vector of [4 x double] containing one of the source operands.
651/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
652/// values between both operands.
653static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
654_mm256_xor_pd(__m256d __a, __m256d __b)
655{
656 return (__m256d)((__v4du)__a ^ (__v4du)__b);
657}
658
659/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
660///
661/// \headerfile <x86intrin.h>
662///
663/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
664///
665/// \param __a
666/// A 256-bit vector of [8 x float] containing one of the source operands.
667/// \param __b
668/// A 256-bit vector of [8 x float] containing one of the source operands.
669/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
670/// values between both operands.
671static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
672_mm256_xor_ps(__m256 __a, __m256 __b)
673{
674 return (__m256)((__v8su)__a ^ (__v8su)__b);
675}
676
677/* Horizontal arithmetic */
678/// Horizontally adds the adjacent pairs of values contained in two
679/// 256-bit vectors of [4 x double].
680///
681/// \headerfile <x86intrin.h>
682///
683/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
684///
685/// \param __a
686/// A 256-bit vector of [4 x double] containing one of the source operands.
687/// The horizontal sums of the values are returned in the even-indexed
688/// elements of a vector of [4 x double].
689/// \param __b
690/// A 256-bit vector of [4 x double] containing one of the source operands.
691/// The horizontal sums of the values are returned in the odd-indexed
692/// elements of a vector of [4 x double].
693/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
694/// both operands.
695static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
696_mm256_hadd_pd(__m256d __a, __m256d __b) {
697 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
698}
699
700/// Horizontally adds the adjacent pairs of values contained in two
701/// 256-bit vectors of [8 x float].
702///
703/// \headerfile <x86intrin.h>
704///
705/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
706///
707/// \param __a
708/// A 256-bit vector of [8 x float] containing one of the source operands.
709/// The horizontal sums of the values are returned in the elements with
710/// index 0, 1, 4, 5 of a vector of [8 x float].
711/// \param __b
712/// A 256-bit vector of [8 x float] containing one of the source operands.
713/// The horizontal sums of the values are returned in the elements with
714/// index 2, 3, 6, 7 of a vector of [8 x float].
715/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
716/// both operands.
717static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
718 __m256 __b) {
719 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
720}
721
722/// Horizontally subtracts the adjacent pairs of values contained in two
723/// 256-bit vectors of [4 x double].
724///
725/// \headerfile <x86intrin.h>
726///
727/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
728///
729/// \param __a
730/// A 256-bit vector of [4 x double] containing one of the source operands.
731/// The horizontal differences between the values are returned in the
732/// even-indexed elements of a vector of [4 x double].
733/// \param __b
734/// A 256-bit vector of [4 x double] containing one of the source operands.
735/// The horizontal differences between the values are returned in the
736/// odd-indexed elements of a vector of [4 x double].
737/// \returns A 256-bit vector of [4 x double] containing the horizontal
738/// differences of both operands.
739static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
740_mm256_hsub_pd(__m256d __a, __m256d __b) {
741 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
742}
743
744/// Horizontally subtracts the adjacent pairs of values contained in two
745/// 256-bit vectors of [8 x float].
746///
747/// \headerfile <x86intrin.h>
748///
749/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
750///
751/// \param __a
752/// A 256-bit vector of [8 x float] containing one of the source operands.
753/// The horizontal differences between the values are returned in the
754/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
755/// \param __b
756/// A 256-bit vector of [8 x float] containing one of the source operands.
757/// The horizontal differences between the values are returned in the
758/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
759/// \returns A 256-bit vector of [8 x float] containing the horizontal
760/// differences of both operands.
761static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a,
762 __m256 __b) {
763 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
764}
765
766/* Vector permutations */
767/// Copies the values in a 128-bit vector of [2 x double] as specified
768/// by the 128-bit integer vector operand.
769///
770/// \headerfile <x86intrin.h>
771///
772/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
773///
774/// \param __a
775/// A 128-bit vector of [2 x double].
776/// \param __c
777/// A 128-bit integer vector operand specifying how the values are to be
778/// copied. \n
779/// Bit [1]: \n
780/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
781/// vector. \n
782/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
783/// returned vector. \n
784/// Bit [65]: \n
785/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
786/// returned vector. \n
787/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
788/// returned vector.
789/// \returns A 128-bit vector of [2 x double] containing the copied values.
790static __inline __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
791_mm_permutevar_pd(__m128d __a, __m128i __c) {
792 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
793}
794
795/// Copies the values in a 256-bit vector of [4 x double] as specified
796/// by the 256-bit integer vector operand.
797///
798/// \headerfile <x86intrin.h>
799///
800/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
801///
802/// \param __a
803/// A 256-bit vector of [4 x double].
804/// \param __c
805/// A 256-bit integer vector operand specifying how the values are to be
806/// copied. \n
807/// Bit [1]: \n
808/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
809/// vector. \n
810/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
811/// returned vector. \n
812/// Bit [65]: \n
813/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
814/// returned vector. \n
815/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
816/// returned vector. \n
817/// Bit [129]: \n
818/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
819/// returned vector. \n
820/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
821/// returned vector. \n
822/// Bit [193]: \n
823/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
824/// returned vector. \n
825/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
826/// returned vector.
827/// \returns A 256-bit vector of [4 x double] containing the copied values.
828static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
829_mm256_permutevar_pd(__m256d __a, __m256i __c) {
  /* Thin wrapper: __a is cast to __v4df (the 32-byte double vector typedef)
   * and the control vector __c to __v4di (the 32-byte long long vector
   * typedef) for the builtin call. The result is cast back to __m256d. */
830 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
831}
832
833/// Copies the values stored in a 128-bit vector of [4 x float] as
834/// specified by the 128-bit integer vector operand.
835///
836/// \headerfile <x86intrin.h>
837///
838/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
839///
840/// \param __a
841/// A 128-bit vector of [4 x float].
842/// \param __c
843/// A 128-bit integer vector operand specifying how the values are to be
844/// copied. \n
845/// Bits [1:0]: \n
846/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
847/// returned vector. \n
848/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
849/// returned vector. \n
850/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
851/// returned vector. \n
852/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
853/// returned vector. \n
854/// Bits [33:32]: \n
855/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
856/// returned vector. \n
857/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
858/// returned vector. \n
859/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
860/// returned vector. \n
861/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
862/// returned vector. \n
863/// Bits [65:64]: \n
864/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
865/// returned vector. \n
866/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
867/// returned vector. \n
868/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
869/// returned vector. \n
870/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
871/// returned vector. \n
872/// Bits [97:96]: \n
873/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
874/// returned vector. \n
875/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
876/// returned vector. \n
877/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
878/// returned vector. \n
879/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
880/// returned vector.
881/// \returns A 128-bit vector of [4 x float] containing the copied values.
882static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
883_mm_permutevar_ps(__m128 __a, __m128i __c) {
  /* Thin wrapper: __a is cast to __v4sf and the control vector __c to __v4si
   * for the builtin call, and the builtin's result is cast back to __m128. */
884 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
885}
886
887/// Copies the values stored in a 256-bit vector of [8 x float] as
888/// specified by the 256-bit integer vector operand.
889///
890/// \headerfile <x86intrin.h>
891///
892/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
893///
894/// \param __a
895/// A 256-bit vector of [8 x float].
896/// \param __c
897/// A 256-bit integer vector operand specifying how the values are to be
898/// copied. \n
899/// Bits [1:0]: \n
900/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
901/// returned vector. \n
902/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
903/// returned vector. \n
904/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
905/// returned vector. \n
906/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
907/// returned vector. \n
908/// Bits [33:32]: \n
909/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
910/// returned vector. \n
911/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
912/// returned vector. \n
913/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
914/// returned vector. \n
915/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
916/// returned vector. \n
917/// Bits [65:64]: \n
918/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
919/// returned vector. \n
920/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
921/// returned vector. \n
922/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
923/// returned vector. \n
924/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
925/// returned vector. \n
926/// Bits [97:96]: \n
927/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
928/// returned vector. \n
929/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
930/// returned vector. \n
931/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
932/// returned vector. \n
933/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
934/// returned vector. \n
935/// Bits [129:128]: \n
936/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
937/// returned vector. \n
938/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
939/// returned vector. \n
940/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
941/// returned vector. \n
942/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
943/// returned vector. \n
944/// Bits [161:160]: \n
945/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
946/// returned vector. \n
947/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
948/// returned vector. \n
949/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
950/// returned vector. \n
951/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
952/// returned vector. \n
953/// Bits [193:192]: \n
954/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
955/// returned vector. \n
956/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
957/// returned vector. \n
958/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
959/// returned vector. \n
960/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
961/// returned vector. \n
962/// Bits [225:224]: \n
963/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
964/// returned vector. \n
965/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
966/// returned vector. \n
967/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
968/// returned vector. \n
969/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
970/// returned vector.
971/// \returns A 256-bit vector of [8 x float] containing the copied values.
972static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
973_mm256_permutevar_ps(__m256 __a, __m256i __c) {
  /* Thin wrapper: __a is cast to __v8sf (the 32-byte float vector typedef)
   * and the control vector __c to __v8si (the 32-byte int vector typedef)
   * for the builtin call. The result is cast back to __m256. */
974 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
975}
976
977/// Copies the values in a 128-bit vector of [2 x double] as specified
978/// by the immediate integer operand.
979///
980/// \headerfile <x86intrin.h>
981///
982/// \code
983/// __m128d _mm_permute_pd(__m128d A, const int C);
984/// \endcode
985///
986/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
987///
988/// \param A
989/// A 128-bit vector of [2 x double].
990/// \param C
991/// An immediate integer operand specifying how the values are to be
992/// copied. \n
993/// Bit [0]: \n
994/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
995/// vector. \n
996/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
997/// returned vector. \n
998/// Bit [1]: \n
999/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1000/// returned vector. \n
1001/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1002/// returned vector.
1003/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Implementation note: expands to one parenthesized expression in which A
 * and C each appear exactly once. A is cast to __m128d and then to __v2df,
 * C is cast to int, and the builtin's result is cast back to __m128d.
 * NOTE(review): presumably a macro rather than an inline function because C
 * has to reach the builtin as a compile-time constant -- confirm. */
1004#define _mm_permute_pd(A, C) \
1005 ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
1006
1007/// Copies the values in a 256-bit vector of [4 x double] as specified by
1008/// the immediate integer operand.
1009///
1010/// \headerfile <x86intrin.h>
1011///
1012/// \code
1013/// __m256d _mm256_permute_pd(__m256d A, const int C);
1014/// \endcode
1015///
1016/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1017///
1018/// \param A
1019/// A 256-bit vector of [4 x double].
1020/// \param C
1021/// An immediate integer operand specifying how the values are to be
1022/// copied. \n
1023/// Bit [0]: \n
1024/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1025/// vector. \n
1026/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1027/// returned vector. \n
1028/// Bit [1]: \n
1029/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1030/// returned vector. \n
1031/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1032/// returned vector. \n
1033/// Bit [2]: \n
1034/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1035/// returned vector. \n
1036/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1037/// returned vector. \n
1038/// Bit [3]: \n
1039/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1040/// returned vector. \n
1041/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1042/// returned vector.
1043/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Implementation note: expands to one parenthesized expression in which A
 * and C each appear exactly once. A is cast to __m256d and then to __v4df,
 * C is cast to int, and the builtin's result is cast back to __m256d. */
1044#define _mm256_permute_pd(A, C) \
1045 ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
1046
1047/// Copies the values in a 128-bit vector of [4 x float] as specified by
1048/// the immediate integer operand.
1049///
1050/// \headerfile <x86intrin.h>
1051///
1052/// \code
1053/// __m128 _mm_permute_ps(__m128 A, const int C);
1054/// \endcode
1055///
1056/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1057///
1058/// \param A
1059/// A 128-bit vector of [4 x float].
1060/// \param C
1061/// An immediate integer operand specifying how the values are to be
1062/// copied. \n
1063/// Bits [1:0]: \n
1064/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1065/// returned vector. \n
1066/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1067/// returned vector. \n
1068/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1069/// returned vector. \n
1070/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1071/// returned vector. \n
1072/// Bits [3:2]: \n
1073/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1074/// returned vector. \n
1075/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1076/// returned vector. \n
1077/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1078/// returned vector. \n
1079/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1080/// returned vector. \n
1081/// Bits [5:4]: \n
1082/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1083/// returned vector. \n
1084/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1085/// returned vector. \n
1086/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1087/// returned vector. \n
1088/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1089/// returned vector. \n
1090/// Bits [7:6]: \n
1091/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1092/// returned vector. \n
1093/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1094/// returned vector. \n
1095/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1096/// returned vector. \n
1097/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1098/// returned vector.
1099/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Implementation note: expands to one parenthesized expression in which A
 * and C each appear exactly once. A is cast to __m128 and then to __v4sf,
 * C is cast to int, and the builtin's result is cast back to __m128. */
1100#define _mm_permute_ps(A, C) \
1101 ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
1102
1103/// Copies the values in a 256-bit vector of [8 x float] as specified by
1104/// the immediate integer operand.
1105///
1106/// \headerfile <x86intrin.h>
1107///
1108/// \code
1109/// __m256 _mm256_permute_ps(__m256 A, const int C);
1110/// \endcode
1111///
1112/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1113///
1114/// \param A
1115/// A 256-bit vector of [8 x float].
1116/// \param C
1117/// An immediate integer operand specifying how the values are to be
1118/// copied. \n
1119/// Bits [1:0]: \n
1120/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1121/// returned vector. \n
1122/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1123/// returned vector. \n
1124/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1125/// returned vector. \n
1126/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1127/// returned vector. \n
1128/// Bits [3:2]: \n
1129/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1130/// returned vector. \n
1131/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1132/// returned vector. \n
1133/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1134/// returned vector. \n
1135/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1136/// returned vector. \n
1137/// Bits [5:4]: \n
1138/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1139/// returned vector. \n
1140/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1141/// returned vector. \n
1142/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1143/// returned vector. \n
1144/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1145/// returned vector. \n
1146/// Bits [7:6]: \n
1147/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1148/// returned vector. \n
1149/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1150/// returned vector. \n
1151/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1152/// returned vector. \n
1153/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1154/// returned vector. \n
1155/// Bits [1:0] (same field, applied to the upper 128-bit half): \n
1156/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1157/// returned vector. \n
1158/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1159/// returned vector. \n
1160/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1161/// returned vector. \n
1162/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1163/// returned vector. \n
1164/// Bits [3:2] (same field, applied to the upper 128-bit half): \n
1165/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1166/// returned vector. \n
1167/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1168/// returned vector. \n
1169/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1170/// returned vector. \n
1171/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1172/// returned vector. \n
1173/// Bits [5:4] (same field, applied to the upper 128-bit half): \n
1174/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1175/// returned vector. \n
1176/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1177/// returned vector. \n
1178/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1179/// returned vector. \n
1180/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1181/// returned vector. \n
1182/// Bits [7:6] (same field, applied to the upper 128-bit half): \n
1183/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1184/// returned vector. \n
1185/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1186/// returned vector. \n
1187/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1188/// returned vector. \n
1189/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1190/// returned vector.
1191/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Implementation note: expands to one parenthesized expression in which A
 * and C each appear exactly once. A is cast to __m256 and then to __v8sf,
 * and the single immediate C is cast to int and passed to the builtin
 * unchanged. The builtin's result is cast back to __m256. */
1192#define _mm256_permute_ps(A, C) \
1193 ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
1194
1195/// Permutes 128-bit data values stored in two 256-bit vectors of
1196/// [4 x double], as specified by the immediate integer operand.
1197///
1198/// \headerfile <x86intrin.h>
1199///
1200/// \code
1201/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1202/// \endcode
1203///
1204/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1205///
1206/// \param V1
1207/// A 256-bit vector of [4 x double].
1208/// \param V2
1209/// A 256-bit vector of [4 x double].
1210/// \param M
1211/// An immediate integer operand specifying how the values are to be
1212/// permuted. \n
1213/// Bits [1:0]: \n
1214/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1215/// destination. \n
1216/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1217/// destination. \n
1218/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1219/// destination. \n
1220/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1221/// destination. \n
1222/// Bits [5:4]: \n
1223/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1224/// destination. \n
1225/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1226/// destination. \n
1227/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1228/// destination. \n
1229/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1230/// destination.
1231/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Implementation note: V1, V2 and M each appear exactly once in the
 * expansion. Both vector operands are cast to __m256d and then to __v4df,
 * M is cast to int, and the builtin's result is cast back to __m256d. */
1232#define _mm256_permute2f128_pd(V1, V2, M) \
1233 ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1234 (__v4df)(__m256d)(V2), (int)(M)))
1235
1236/// Permutes 128-bit data values stored in two 256-bit vectors of
1237/// [8 x float], as specified by the immediate integer operand.
1238///
1239/// \headerfile <x86intrin.h>
1240///
1241/// \code
1242/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1243/// \endcode
1244///
1245/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1246///
1247/// \param V1
1248/// A 256-bit vector of [8 x float].
1249/// \param V2
1250/// A 256-bit vector of [8 x float].
1251/// \param M
1252/// An immediate integer operand specifying how the values are to be
1253/// permuted. \n
1254/// Bits [1:0]: \n
1255/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1256/// destination. \n
1257/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1258/// destination. \n
1259/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1260/// destination. \n
1261/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1262/// destination. \n
1263/// Bits [5:4]: \n
1264/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1265/// destination. \n
1266/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1267/// destination. \n
1268/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1269/// destination. \n
1270/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1271/// destination.
1272/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Implementation note: V1, V2 and M each appear exactly once in the
 * expansion. Both vector operands are cast to __m256 and then to __v8sf,
 * M is cast to int, and the builtin's result is cast back to __m256. */
1273#define _mm256_permute2f128_ps(V1, V2, M) \
1274 ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1275 (__v8sf)(__m256)(V2), (int)(M)))
1276
1277/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1278/// as specified by the immediate integer operand.
1279///
1280/// \headerfile <x86intrin.h>
1281///
1282/// \code
1283/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1284/// \endcode
1285///
1286/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1287///
1288/// \param V1
1289/// A 256-bit integer vector.
1290/// \param V2
1291/// A 256-bit integer vector.
1292/// \param M
1293/// An immediate integer operand specifying how the values are to be copied. \n
1294/// Bits [1:0]: \n
1295/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1296/// destination. \n
1297/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1298/// destination. \n
1299/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1300/// destination. \n
1301/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1302/// destination. \n
1303/// Bits [5:4]: \n
1304/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1305/// destination. \n
1306/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1307/// destination. \n
1308/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1309/// destination. \n
1310/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1311/// destination.
1312/// \returns A 256-bit integer vector containing the copied values.
/* Implementation note: V1, V2 and M each appear exactly once in the
 * expansion. Both vector operands are cast to __m256i and then to __v8si
 * (the 32-byte int vector typedef) for the builtin call, M is cast to int,
 * and the builtin's result is cast back to __m256i. */
1313#define _mm256_permute2f128_si256(V1, V2, M) \
1314 ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1315 (__v8si)(__m256i)(V2), (int)(M)))
1316
1317/* Vector Blend */
1318/// Merges 64-bit double-precision data values stored in either of the
1319/// two 256-bit vectors of [4 x double], as specified by the immediate
1320/// integer operand.
1321///
1322/// \headerfile <x86intrin.h>
1323///
1324/// \code
1325/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1326/// \endcode
1327///
1328/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1329///
1330/// \param V1
1331/// A 256-bit vector of [4 x double].
1332/// \param V2
1333/// A 256-bit vector of [4 x double].
1334/// \param M
1335/// An immediate integer operand, with mask bits [3:0] specifying how the
1336/// values are to be copied. The position of the mask bit corresponds to the
1337/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1338/// element in operand \a V1 is copied to the same position in the
1339/// destination. When a mask bit is 1, the corresponding 64-bit element in
1340/// operand \a V2 is copied to the same position in the destination.
1341/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Implementation note: V1, V2 and M each appear exactly once in the
 * expansion. Both vector operands are cast to __m256d and then to __v4df,
 * the mask M is cast to int, and the builtin's result is cast back to
 * __m256d. */
1342#define _mm256_blend_pd(V1, V2, M) \
1343 ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
1344 (__v4df)(__m256d)(V2), (int)(M)))
1345
1346/// Merges 32-bit single-precision data values stored in either of the
1347/// two 256-bit vectors of [8 x float], as specified by the immediate
1348/// integer operand.
1349///
1350/// \headerfile <x86intrin.h>
1351///
1352/// \code
1353/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1354/// \endcode
1355///
1356/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1357///
1358/// \param V1
1359/// A 256-bit vector of [8 x float].
1360/// \param V2
1361/// A 256-bit vector of [8 x float].
1362/// \param M
1363/// An immediate integer operand, with mask bits [7:0] specifying how the
1364/// values are to be copied. The position of the mask bit corresponds to the
1365/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1366/// element in operand \a V1 is copied to the same position in the
1367/// destination. When a mask bit is 1, the corresponding 32-bit element in
1368/// operand \a V2 is copied to the same position in the destination.
1369/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Implementation note: V1, V2 and M each appear exactly once in the
 * expansion. Both vector operands are cast to __m256 and then to __v8sf,
 * the mask M is cast to int, and the builtin's result is cast back to
 * __m256. */
1370#define _mm256_blend_ps(V1, V2, M) \
1371 ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
1372 (__v8sf)(__m256)(V2), (int)(M)))
1373
1374/// Merges 64-bit double-precision data values stored in either of the
1375/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1376/// operand.
1377///
1378/// \headerfile <x86intrin.h>
1379///
1380/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1381///
1382/// \param __a
1383/// A 256-bit vector of [4 x double].
1384/// \param __b
1385/// A 256-bit vector of [4 x double].
1386/// \param __c
1387/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1388/// how the values are to be copied. The position of the mask bit corresponds
1389/// to the most significant bit of a copied value. When a mask bit is 0, the
1390/// corresponding 64-bit element in operand \a __a is copied to the same
1391/// position in the destination. When a mask bit is 1, the corresponding
1392/// 64-bit element in operand \a __b is copied to the same position in the
1393/// destination.
1394/// \returns A 256-bit vector of [4 x double] containing the copied values.
1395static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
1396_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {
  /* Thin wrapper: all three operands, including the mask __c, are cast to
   * __v4df for the builtin call. The mask is therefore passed as a
   * floating-point vector, not an integer vector. The result is cast back
   * to __m256d. */
1397 return (__m256d)__builtin_ia32_blendvpd256(
1398 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1399}
1400
1401/// Merges 32-bit single-precision data values stored in either of the
1402/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1403/// operand.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1408///
1409/// \param __a
1410/// A 256-bit vector of [8 x float].
1411/// \param __b
1412/// A 256-bit vector of [8 x float].
1413/// \param __c
1414/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1415/// and 31 specifying how the values are to be copied. The position of the
1416/// mask bit corresponds to the most significant bit of a copied value. When
1417/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1418/// copied to the same position in the destination. When a mask bit is 1, the
1419/// corresponding 32-bit element in operand \a __b is copied to the same
1420/// position in the destination.
1421/// \returns A 256-bit vector of [8 x float] containing the copied values.
1422static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
1423_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {
  /* Thin wrapper: all three operands, including the mask __c, are cast to
   * __v8sf for the builtin call. The mask is therefore passed as a
   * floating-point vector, not an integer vector. The result is cast back
   * to __m256. */
1424 return (__m256)__builtin_ia32_blendvps256(
1425 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1426}
1427
1428/* Vector Dot Product */
1429/// Computes two dot products in parallel, using the lower and upper
1430/// halves of two [8 x float] vectors as input to the two computations, and
1431/// returning the two dot products in the lower and upper halves of the
1432/// [8 x float] result.
1433///
1434/// The immediate integer operand controls which input elements will
1435/// contribute to the dot product, and where the final results are returned.
1436/// In general, for each dot product, the four corresponding elements of the
1437/// input vectors are multiplied; the first two and second two products are
1438/// summed, then the two sums are added to form the final result.
1439///
1440/// \headerfile <x86intrin.h>
1441///
1442/// \code
1443/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1444/// \endcode
1445///
1446/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1447///
1448/// \param V1
1449/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1450/// \param V2
1451/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1452/// \param M
1453/// An immediate integer argument. Bits [7:4] determine which elements of
1454/// the input vectors are used, with bit [4] corresponding to the lowest
1455/// element and bit [7] corresponding to the highest element of each [4 x
1456/// float] subvector. If a bit is set, the corresponding elements from the
1457/// two input vectors are used as an input for dot product; otherwise that
1458/// input is treated as zero. Bits [3:0] determine which elements of the
1459/// result will receive a copy of the final dot product, with bit [0]
1460/// corresponding to the lowest element and bit [3] corresponding to the
1461/// highest element of each [4 x float] subvector. If a bit is set, the dot
1462/// product is returned in the corresponding element; otherwise that element
1463/// is set to zero. The bitmask is applied in the same way to each of the
1464/// two parallel dot product computations.
1465/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* Implementation note: V1, V2 and M each appear exactly once in the
 * expansion. Both vector operands are cast to __m256 and then to __v8sf,
 * and the builtin's result is cast back to __m256.
 * NOTE(review): M is forwarded as (M) with no (int) cast, unlike the
 * neighbouring permute, blend and shuffle macros -- confirm whether this
 * difference is intentional before changing it. */
1466#define _mm256_dp_ps(V1, V2, M) \
1467 ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
1468 (__v8sf)(__m256)(V2), (M)))
1469
1470/* Vector shuffle */
1471/// Selects 8 float values from the 256-bit operands of [8 x float], as
1472/// specified by the immediate value operand.
1473///
1474/// The four selected elements in each operand are copied to the destination
1475/// according to the bits specified in the immediate operand. The selected
1476/// elements from the first 256-bit operand are copied to bits [63:0] and
1477/// bits [191:128] of the destination, and the selected elements from the
1478/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1479/// the destination. For example, if bits [7:0] of the immediate operand
1480/// contain a value of 0xFF, the 256-bit destination vector would contain the
1481/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1482///
1483/// \headerfile <x86intrin.h>
1484///
1485/// \code
1486/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1487/// \endcode
1488///
1489/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1490///
1491/// \param a
1492/// A 256-bit vector of [8 x float]. The four selected elements in this
1493/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1494/// according to the bits specified in the immediate operand.
1495/// \param b
1496/// A 256-bit vector of [8 x float]. The four selected elements in this
1497/// operand are copied to bits [127:64] and bits [255:192] in the
1498/// destination, according to the bits specified in the immediate operand.
1499/// \param mask
1500/// An immediate value containing an 8-bit value specifying which elements to
1501/// copy from \a a and \a b. \n
1502/// Bits [3:0] specify the values copied from operand \a a. \n
1503/// Bits [7:4] specify the values copied from operand \a b. \n
1504/// The destinations within the 256-bit destination are assigned values as
1505/// follows, according to the bit value assignments described below: \n
1506/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1507/// destination. \n
1508/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1509/// destination. \n
1510/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1511/// destination. \n
1512/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1513/// the destination. \n
1514/// Bit value assignments: \n
1515/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1516/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1517/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1518/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1519/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1520/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1521/// <c>[b6, b4, b2, b0]</c>.
1522/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
/* Implementation note: a, b and mask each appear exactly once in the
 * expansion. Both vector operands are cast to __m256 and then to __v8sf,
 * mask is cast to int, and the builtin's result is cast back to __m256. */
1523#define _mm256_shuffle_ps(a, b, mask) \
1524 ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
1525 (__v8sf)(__m256)(b), (int)(mask)))
1526
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1572
/* Compare */
/* Comparison predicate encodings 0x08..0x1f for the immediate operand of the
 * cmp intrinsics documented in this file. Encodings 0x00..0x07 are not defined
 * in this span; presumably the SSE/SSE2 headers provide them (confirm). */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
1598
/* The intrinsic below is defined in emmintrin.h and can also be used with AVX. */
1600/// Compares each of the corresponding double-precision values of two
1601/// 128-bit vectors of [2 x double], using the operation specified by the
1602/// immediate integer operand.
1603///
1604/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1605/// If either value in a comparison is NaN, comparisons that are ordered
1606/// return false, and comparisons that are unordered return true.
1607///
1608/// \headerfile <x86intrin.h>
1609///
1610/// \code
1611/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1612/// \endcode
1613///
1614/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1615///
1616/// \param a
1617/// A 128-bit vector of [2 x double].
1618/// \param b
1619/// A 128-bit vector of [2 x double].
1620/// \param c
1621/// An immediate integer operand, with bits [4:0] specifying which comparison
1622/// operation to use: \n
1623/// 0x00: Equal (ordered, non-signaling) \n
1624/// 0x01: Less-than (ordered, signaling) \n
1625/// 0x02: Less-than-or-equal (ordered, signaling) \n
1626/// 0x03: Unordered (non-signaling) \n
1627/// 0x04: Not-equal (unordered, non-signaling) \n
1628/// 0x05: Not-less-than (unordered, signaling) \n
1629/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1630/// 0x07: Ordered (non-signaling) \n
1631/// 0x08: Equal (unordered, non-signaling) \n
1632/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1633/// 0x0A: Not-greater-than (unordered, signaling) \n
1634/// 0x0B: False (ordered, non-signaling) \n
1635/// 0x0C: Not-equal (ordered, non-signaling) \n
1636/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1637/// 0x0E: Greater-than (ordered, signaling) \n
1638/// 0x0F: True (unordered, non-signaling) \n
1639/// 0x10: Equal (ordered, signaling) \n
1640/// 0x11: Less-than (ordered, non-signaling) \n
1641/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1642/// 0x13: Unordered (signaling) \n
1643/// 0x14: Not-equal (unordered, signaling) \n
1644/// 0x15: Not-less-than (unordered, non-signaling) \n
1645/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1646/// 0x17: Ordered (signaling) \n
1647/// 0x18: Equal (unordered, signaling) \n
1648/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1649/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1650/// 0x1B: False (ordered, signaling) \n
1651/// 0x1C: Not-equal (ordered, signaling) \n
1652/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1653/// 0x1E: Greater-than (ordered, non-signaling) \n
1654/// 0x1F: True (unordered, signaling)
1655/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1656/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1657
/* The intrinsic below is defined in xmmintrin.h and can also be used with AVX. */
1659/// Compares each of the corresponding values of two 128-bit vectors of
1660/// [4 x float], using the operation specified by the immediate integer
1661/// operand.
1662///
1663/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1664/// If either value in a comparison is NaN, comparisons that are ordered
1665/// return false, and comparisons that are unordered return true.
1666///
1667/// \headerfile <x86intrin.h>
1668///
1669/// \code
1670/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1671/// \endcode
1672///
1673/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1674///
1675/// \param a
1676/// A 128-bit vector of [4 x float].
1677/// \param b
1678/// A 128-bit vector of [4 x float].
1679/// \param c
1680/// An immediate integer operand, with bits [4:0] specifying which comparison
1681/// operation to use: \n
1682/// 0x00: Equal (ordered, non-signaling) \n
1683/// 0x01: Less-than (ordered, signaling) \n
1684/// 0x02: Less-than-or-equal (ordered, signaling) \n
1685/// 0x03: Unordered (non-signaling) \n
1686/// 0x04: Not-equal (unordered, non-signaling) \n
1687/// 0x05: Not-less-than (unordered, signaling) \n
1688/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1689/// 0x07: Ordered (non-signaling) \n
1690/// 0x08: Equal (unordered, non-signaling) \n
1691/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1692/// 0x0A: Not-greater-than (unordered, signaling) \n
1693/// 0x0B: False (ordered, non-signaling) \n
1694/// 0x0C: Not-equal (ordered, non-signaling) \n
1695/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1696/// 0x0E: Greater-than (ordered, signaling) \n
1697/// 0x0F: True (unordered, non-signaling) \n
1698/// 0x10: Equal (ordered, signaling) \n
1699/// 0x11: Less-than (ordered, non-signaling) \n
1700/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1701/// 0x13: Unordered (signaling) \n
1702/// 0x14: Not-equal (unordered, signaling) \n
1703/// 0x15: Not-less-than (unordered, non-signaling) \n
1704/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1705/// 0x17: Ordered (signaling) \n
1706/// 0x18: Equal (unordered, signaling) \n
1707/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1708/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1709/// 0x1B: False (ordered, signaling) \n
1710/// 0x1C: Not-equal (ordered, signaling) \n
1711/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1712/// 0x1E: Greater-than (ordered, non-signaling) \n
1713/// 0x1F: True (unordered, signaling)
1714/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1715/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1716
1717/// Compares each of the corresponding double-precision values of two
1718/// 256-bit vectors of [4 x double], using the operation specified by the
1719/// immediate integer operand.
1720///
1721/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1722/// If either value in a comparison is NaN, comparisons that are ordered
1723/// return false, and comparisons that are unordered return true.
1724///
1725/// \headerfile <x86intrin.h>
1726///
1727/// \code
1728/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
1729/// \endcode
1730///
1731/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1732///
1733/// \param a
1734/// A 256-bit vector of [4 x double].
1735/// \param b
1736/// A 256-bit vector of [4 x double].
1737/// \param c
1738/// An immediate integer operand, with bits [4:0] specifying which comparison
1739/// operation to use: \n
1740/// 0x00: Equal (ordered, non-signaling) \n
1741/// 0x01: Less-than (ordered, signaling) \n
1742/// 0x02: Less-than-or-equal (ordered, signaling) \n
1743/// 0x03: Unordered (non-signaling) \n
1744/// 0x04: Not-equal (unordered, non-signaling) \n
1745/// 0x05: Not-less-than (unordered, signaling) \n
1746/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1747/// 0x07: Ordered (non-signaling) \n
1748/// 0x08: Equal (unordered, non-signaling) \n
1749/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1750/// 0x0A: Not-greater-than (unordered, signaling) \n
1751/// 0x0B: False (ordered, non-signaling) \n
1752/// 0x0C: Not-equal (ordered, non-signaling) \n
1753/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1754/// 0x0E: Greater-than (ordered, signaling) \n
1755/// 0x0F: True (unordered, non-signaling) \n
1756/// 0x10: Equal (ordered, signaling) \n
1757/// 0x11: Less-than (ordered, non-signaling) \n
1758/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1759/// 0x13: Unordered (signaling) \n
1760/// 0x14: Not-equal (unordered, signaling) \n
1761/// 0x15: Not-less-than (unordered, non-signaling) \n
1762/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1763/// 0x17: Ordered (signaling) \n
1764/// 0x18: Equal (unordered, signaling) \n
1765/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1766/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1767/// 0x1B: False (ordered, signaling) \n
1768/// 0x1C: Not-equal (ordered, signaling) \n
1769/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1770/// 0x1E: Greater-than (ordered, non-signaling) \n
1771/// 0x1F: True (unordered, signaling)
1772/// \returns A 256-bit vector of [4 x double] containing the comparison results.
/* Forwards a and b (cast to __v4df) and the predicate c, unchanged, to
 * __builtin_ia32_cmppd256. */
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1776
1777/// Compares each of the corresponding values of two 256-bit vectors of
1778/// [8 x float], using the operation specified by the immediate integer
1779/// operand.
1780///
1781/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1782/// If either value in a comparison is NaN, comparisons that are ordered
1783/// return false, and comparisons that are unordered return true.
1784///
1785/// \headerfile <x86intrin.h>
1786///
1787/// \code
1788/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
1789/// \endcode
1790///
1791/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1792///
1793/// \param a
1794/// A 256-bit vector of [8 x float].
1795/// \param b
1796/// A 256-bit vector of [8 x float].
1797/// \param c
1798/// An immediate integer operand, with bits [4:0] specifying which comparison
1799/// operation to use: \n
1800/// 0x00: Equal (ordered, non-signaling) \n
1801/// 0x01: Less-than (ordered, signaling) \n
1802/// 0x02: Less-than-or-equal (ordered, signaling) \n
1803/// 0x03: Unordered (non-signaling) \n
1804/// 0x04: Not-equal (unordered, non-signaling) \n
1805/// 0x05: Not-less-than (unordered, signaling) \n
1806/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1807/// 0x07: Ordered (non-signaling) \n
1808/// 0x08: Equal (unordered, non-signaling) \n
1809/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1810/// 0x0A: Not-greater-than (unordered, signaling) \n
1811/// 0x0B: False (ordered, non-signaling) \n
1812/// 0x0C: Not-equal (ordered, non-signaling) \n
1813/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1814/// 0x0E: Greater-than (ordered, signaling) \n
1815/// 0x0F: True (unordered, non-signaling) \n
1816/// 0x10: Equal (ordered, signaling) \n
1817/// 0x11: Less-than (ordered, non-signaling) \n
1818/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1819/// 0x13: Unordered (signaling) \n
1820/// 0x14: Not-equal (unordered, signaling) \n
1821/// 0x15: Not-less-than (unordered, non-signaling) \n
1822/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1823/// 0x17: Ordered (signaling) \n
1824/// 0x18: Equal (unordered, signaling) \n
1825/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1826/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1827/// 0x1B: False (ordered, signaling) \n
1828/// 0x1C: Not-equal (ordered, signaling) \n
1829/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1830/// 0x1E: Greater-than (ordered, non-signaling) \n
1831/// 0x1F: True (unordered, signaling)
1832/// \returns A 256-bit vector of [8 x float] containing the comparison results.
/* Forwards a and b (cast to __v8sf) and the predicate c, unchanged, to
 * __builtin_ia32_cmpps256. */
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1836
/* The intrinsic below is defined in emmintrin.h and can also be used with AVX. */
1838/// Compares each of the corresponding scalar double-precision values of
1839/// two 128-bit vectors of [2 x double], using the operation specified by the
1840/// immediate integer operand.
1841///
1842/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1843/// If either value in a comparison is NaN, comparisons that are ordered
1844/// return false, and comparisons that are unordered return true.
1845///
1846/// \headerfile <x86intrin.h>
1847///
1848/// \code
1849/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1850/// \endcode
1851///
1852/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1853///
1854/// \param a
1855/// A 128-bit vector of [2 x double].
1856/// \param b
1857/// A 128-bit vector of [2 x double].
1858/// \param c
1859/// An immediate integer operand, with bits [4:0] specifying which comparison
1860/// operation to use: \n
1861/// 0x00: Equal (ordered, non-signaling) \n
1862/// 0x01: Less-than (ordered, signaling) \n
1863/// 0x02: Less-than-or-equal (ordered, signaling) \n
1864/// 0x03: Unordered (non-signaling) \n
1865/// 0x04: Not-equal (unordered, non-signaling) \n
1866/// 0x05: Not-less-than (unordered, signaling) \n
1867/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1868/// 0x07: Ordered (non-signaling) \n
1869/// 0x08: Equal (unordered, non-signaling) \n
1870/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1871/// 0x0A: Not-greater-than (unordered, signaling) \n
1872/// 0x0B: False (ordered, non-signaling) \n
1873/// 0x0C: Not-equal (ordered, non-signaling) \n
1874/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1875/// 0x0E: Greater-than (ordered, signaling) \n
1876/// 0x0F: True (unordered, non-signaling) \n
1877/// 0x10: Equal (ordered, signaling) \n
1878/// 0x11: Less-than (ordered, non-signaling) \n
1879/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1880/// 0x13: Unordered (signaling) \n
1881/// 0x14: Not-equal (unordered, signaling) \n
1882/// 0x15: Not-less-than (unordered, non-signaling) \n
1883/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1884/// 0x17: Ordered (signaling) \n
1885/// 0x18: Equal (unordered, signaling) \n
1886/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1887/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1888/// 0x1B: False (ordered, signaling) \n
1889/// 0x1C: Not-equal (ordered, signaling) \n
1890/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1891/// 0x1E: Greater-than (ordered, non-signaling) \n
1892/// 0x1F: True (unordered, signaling)
1893/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1894/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1895
/* The intrinsic below is defined in xmmintrin.h and can also be used with AVX. */
1897/// Compares each of the corresponding scalar values of two 128-bit
1898/// vectors of [4 x float], using the operation specified by the immediate
1899/// integer operand.
1900///
1901/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1902/// If either value in a comparison is NaN, comparisons that are ordered
1903/// return false, and comparisons that are unordered return true.
1904///
1905/// \headerfile <x86intrin.h>
1906///
1907/// \code
1908/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1909/// \endcode
1910///
1911/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1912///
1913/// \param a
1914/// A 128-bit vector of [4 x float].
1915/// \param b
1916/// A 128-bit vector of [4 x float].
1917/// \param c
1918/// An immediate integer operand, with bits [4:0] specifying which comparison
1919/// operation to use: \n
1920/// 0x00: Equal (ordered, non-signaling) \n
1921/// 0x01: Less-than (ordered, signaling) \n
1922/// 0x02: Less-than-or-equal (ordered, signaling) \n
1923/// 0x03: Unordered (non-signaling) \n
1924/// 0x04: Not-equal (unordered, non-signaling) \n
1925/// 0x05: Not-less-than (unordered, signaling) \n
1926/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1927/// 0x07: Ordered (non-signaling) \n
1928/// 0x08: Equal (unordered, non-signaling) \n
1929/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1930/// 0x0A: Not-greater-than (unordered, signaling) \n
1931/// 0x0B: False (ordered, non-signaling) \n
1932/// 0x0C: Not-equal (ordered, non-signaling) \n
1933/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1934/// 0x0E: Greater-than (ordered, signaling) \n
1935/// 0x0F: True (unordered, non-signaling) \n
1936/// 0x10: Equal (ordered, signaling) \n
1937/// 0x11: Less-than (ordered, non-signaling) \n
1938/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1939/// 0x13: Unordered (signaling) \n
1940/// 0x14: Not-equal (unordered, signaling) \n
1941/// 0x15: Not-less-than (unordered, non-signaling) \n
1942/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1943/// 0x17: Ordered (signaling) \n
1944/// 0x18: Equal (unordered, signaling) \n
1945/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1946/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1947/// 0x1B: False (ordered, signaling) \n
1948/// 0x1C: Not-equal (ordered, signaling) \n
1949/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1950/// 0x1E: Greater-than (ordered, non-signaling) \n
1951/// 0x1F: True (unordered, signaling)
1952/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1953/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1954
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1976
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    The extracted element is converted to unsigned short before being
///    widened to int, so the result is zero-extended.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
1999
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    The extracted element is converted to unsigned char before being
///    widened to int, so the result is zero-extended.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2022
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    Only available when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2046
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2072
2073
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
///    The macro passes it to the builtin as an int.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2099
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
///    The macro passes it to the builtin as an int.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2125
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
///    Only available when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2153
2154/* Conversion */
2155/// Converts a vector of [4 x i32] into a vector of [4 x double].
2156///
2157/// \headerfile <x86intrin.h>
2158///
2159/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2160///
2161/// \param __a
2162/// A 128-bit integer vector of [4 x i32].
2163/// \returns A 256-bit vector of [4 x double] containing the converted values.
2164static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2166 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2167}
2168
2169/// Converts a vector of [8 x i32] into a vector of [8 x float].
2170///
2171/// \headerfile <x86intrin.h>
2172///
2173/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2174///
2175/// \param __a
2176/// A 256-bit integer vector.
2177/// \returns A 256-bit vector of [8 x float] containing the converted values.
2178static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2180 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2181}
2182
2183/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2184/// [4 x float].
2185///
2186/// \headerfile <x86intrin.h>
2187///
2188/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2189///
2190/// \param __a
2191/// A 256-bit vector of [4 x double].
2192/// \returns A 128-bit vector of [4 x float] containing the converted values.
2193static __inline __m128 __DEFAULT_FN_ATTRS
2195{
2196 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2197}
2198
2199/// Converts a vector of [8 x float] into a vector of [8 x i32].
2200///
2201/// If a converted value does not fit in a 32-bit integer, raises a
2202/// floating-point invalid exception. If the exception is masked, returns
2203/// the most negative integer.
2204///
2205/// \headerfile <x86intrin.h>
2206///
2207/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2208///
2209/// \param __a
2210/// A 256-bit vector of [8 x float].
2211/// \returns A 256-bit integer vector containing the converted values.
2212static __inline __m256i __DEFAULT_FN_ATTRS
2214{
2215 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2216}
2217
2218/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2219/// x double].
2220///
2221/// \headerfile <x86intrin.h>
2222///
2223/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2224///
2225/// \param __a
2226/// A 128-bit vector of [4 x float].
2227/// \returns A 256-bit vector of [4 x double] containing the converted values.
2228static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2230 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2231}
2232
2233/// Converts a 256-bit vector of [4 x double] into four signed truncated
2234/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2235/// [4 x i32].
2236///
2237/// If a converted value does not fit in a 32-bit integer, raises a
2238/// floating-point invalid exception. If the exception is masked, returns
2239/// the most negative integer.
2240///
2241/// \headerfile <x86intrin.h>
2242///
2243/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2244///
2245/// \param __a
2246/// A 256-bit vector of [4 x double].
2247/// \returns A 128-bit integer vector containing the converted values.
2248static __inline __m128i __DEFAULT_FN_ATTRS
2250{
2251 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2252}
2253
2254/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2255/// [4 x i32].
2256///
2257/// If a converted value does not fit in a 32-bit integer, raises a
2258/// floating-point invalid exception. If the exception is masked, returns
2259/// the most negative integer.
2260///
2261/// \headerfile <x86intrin.h>
2262///
2263/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2264///
2265/// \param __a
2266/// A 256-bit vector of [4 x double].
2267/// \returns A 128-bit integer vector containing the converted values.
2268static __inline __m128i __DEFAULT_FN_ATTRS
2270{
2271 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2272}
2273
2274/// Converts a vector of [8 x float] into eight signed truncated (rounded
2275/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2276///
2277/// If a converted value does not fit in a 32-bit integer, raises a
2278/// floating-point invalid exception. If the exception is masked, returns
2279/// the most negative integer.
2280///
2281/// \headerfile <x86intrin.h>
2282///
2283/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2284///
2285/// \param __a
2286/// A 256-bit vector of [8 x float].
2287/// \returns A 256-bit integer vector containing the converted values.
2288static __inline __m256i __DEFAULT_FN_ATTRS
2290{
2291 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2292}
2293
2294/// Returns the first element of the input vector of [4 x double].
2295///
2296/// \headerfile <x86intrin.h>
2297///
2298/// This intrinsic is a utility function and does not correspond to a specific
2299/// instruction.
2300///
2301/// \param __a
2302/// A 256-bit vector of [4 x double].
2303/// \returns A 64 bit double containing the first element of the input vector.
2304static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR
2306 return __a[0];
2307}
2308
2309/// Returns the first element of the input vector of [8 x i32].
2310///
2311/// \headerfile <x86intrin.h>
2312///
2313/// This intrinsic is a utility function and does not correspond to a specific
2314/// instruction.
2315///
2316/// \param __a
2317/// A 256-bit vector of [8 x i32].
2318/// \returns A 32 bit integer containing the first element of the input vector.
2319static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2321 __v8si __b = (__v8si)__a;
2322 return __b[0];
2323}
2324
2325/// Returns the first element of the input vector of [8 x float].
2326///
2327/// \headerfile <x86intrin.h>
2328///
2329/// This intrinsic is a utility function and does not correspond to a specific
2330/// instruction.
2331///
2332/// \param __a
2333/// A 256-bit vector of [8 x float].
2334/// \returns A 32 bit float containing the first element of the input vector.
2335static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR
2337 return __a[0];
2338}
2339
/* Vector replicate */
2341/// Moves and duplicates odd-indexed values from a 256-bit vector of
2342/// [8 x float] to float values in a 256-bit vector of [8 x float].
2343///
2344/// \headerfile <x86intrin.h>
2345///
2346/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2347///
2348/// \param __a
2349/// A 256-bit vector of [8 x float]. \n
2350/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2351/// the return value. \n
2352/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2353/// the return value. \n
2354/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2355/// return value. \n
2356/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2357/// return value.
2358/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2359/// values.
2360static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2362{
2363 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2364}
2365
2366/// Moves and duplicates even-indexed values from a 256-bit vector of
2367/// [8 x float] to float values in a 256-bit vector of [8 x float].
2368///
2369/// \headerfile <x86intrin.h>
2370///
2371/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2372///
2373/// \param __a
2374/// A 256-bit vector of [8 x float]. \n
2375/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2376/// the return value. \n
2377/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2378/// the return value. \n
2379/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2380/// return value. \n
2381/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2382/// return value.
2383/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2384/// values.
2385static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2387{
2388 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2389}
2390
2391/// Moves and duplicates double-precision floating point values from a
2392/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2393/// vector of [4 x double].
2394///
2395/// \headerfile <x86intrin.h>
2396///
2397/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2398///
2399/// \param __a
2400/// A 256-bit vector of [4 x double]. \n
2401/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2402/// return value. \n
2403/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2404/// the return value.
2405/// \returns A 256-bit vector of [4 x double] containing the moved and
2406/// duplicated values.
2407static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2409{
2410 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2411}
2412
/* Unpack and Interleave */
2414/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2415/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2416///
2417/// \headerfile <x86intrin.h>
2418///
2419/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2420///
2421/// \param __a
2422/// A 256-bit floating-point vector of [4 x double]. \n
2423/// Bits [127:64] are written to bits [63:0] of the return value. \n
2424/// Bits [255:192] are written to bits [191:128] of the return value. \n
2425/// \param __b
2426/// A 256-bit floating-point vector of [4 x double]. \n
2427/// Bits [127:64] are written to bits [127:64] of the return value. \n
2428/// Bits [255:192] are written to bits [255:192] of the return value. \n
2429/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2430static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2431_mm256_unpackhi_pd(__m256d __a, __m256d __b) {
2432 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2433}
2434
2435/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2436/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2437///
2438/// \headerfile <x86intrin.h>
2439///
2440/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2441///
2442/// \param __a
2443/// A 256-bit floating-point vector of [4 x double]. \n
2444/// Bits [63:0] are written to bits [63:0] of the return value. \n
2445/// Bits [191:128] are written to bits [191:128] of the return value.
2446/// \param __b
2447/// A 256-bit floating-point vector of [4 x double]. \n
2448/// Bits [63:0] are written to bits [127:64] of the return value. \n
2449/// Bits [191:128] are written to bits [255:192] of the return value. \n
2450/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2451static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2452_mm256_unpacklo_pd(__m256d __a, __m256d __b) {
2453 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2454}
2455
2456/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2457/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2458/// vector of [8 x float].
2459///
2460/// \headerfile <x86intrin.h>
2461///
2462/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2463///
2464/// \param __a
2465/// A 256-bit vector of [8 x float]. \n
2466/// Bits [95:64] are written to bits [31:0] of the return value. \n
2467/// Bits [127:96] are written to bits [95:64] of the return value. \n
2468/// Bits [223:192] are written to bits [159:128] of the return value. \n
2469/// Bits [255:224] are written to bits [223:192] of the return value.
2470/// \param __b
2471/// A 256-bit vector of [8 x float]. \n
2472/// Bits [95:64] are written to bits [63:32] of the return value. \n
2473/// Bits [127:96] are written to bits [127:96] of the return value. \n
2474/// Bits [223:192] are written to bits [191:160] of the return value. \n
2475/// Bits [255:224] are written to bits [255:224] of the return value.
2476/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2477static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2478_mm256_unpackhi_ps(__m256 __a, __m256 __b) {
2479 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2480}
2481
2482/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2483/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2484/// vector of [8 x float].
2485///
2486/// \headerfile <x86intrin.h>
2487///
2488/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2489///
2490/// \param __a
2491/// A 256-bit vector of [8 x float]. \n
2492/// Bits [31:0] are written to bits [31:0] of the return value. \n
2493/// Bits [63:32] are written to bits [95:64] of the return value. \n
2494/// Bits [159:128] are written to bits [159:128] of the return value. \n
2495/// Bits [191:160] are written to bits [223:192] of the return value.
2496/// \param __b
2497/// A 256-bit vector of [8 x float]. \n
2498/// Bits [31:0] are written to bits [63:32] of the return value. \n
2499/// Bits [63:32] are written to bits [127:96] of the return value. \n
2500/// Bits [159:128] are written to bits [191:160] of the return value. \n
2501/// Bits [191:160] are written to bits [255:224] of the return value.
2502/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2503static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2504_mm256_unpacklo_ps(__m256 __a, __m256 __b) {
2505 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2506}
2507
/* Bit Test */
2509/// Given two 128-bit floating-point vectors of [2 x double], perform an
2510/// element-by-element comparison of the double-precision element in the
2511/// first source vector and the corresponding element in the second source
2512/// vector.
2513///
2514/// The EFLAGS register is updated as follows: \n
2515/// If there is at least one pair of double-precision elements where the
2516/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2517/// ZF flag is set to 1. \n
2518/// If there is at least one pair of double-precision elements where the
2519/// sign-bit of the first element is 0 and the sign-bit of the second element
2520/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2521/// This intrinsic returns the value of the ZF flag.
2522///
2523/// \headerfile <x86intrin.h>
2524///
2525/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2526///
2527/// \param __a
2528/// A 128-bit vector of [2 x double].
2529/// \param __b
2530/// A 128-bit vector of [2 x double].
2531/// \returns the ZF flag in the EFLAGS register.
2533 __m128d __b) {
2534 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2535}
2536
2537/// Given two 128-bit floating-point vectors of [2 x double], perform an
2538/// element-by-element comparison of the double-precision element in the
2539/// first source vector and the corresponding element in the second source
2540/// vector.
2541///
2542/// The EFLAGS register is updated as follows: \n
2543/// If there is at least one pair of double-precision elements where the
2544/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2545/// ZF flag is set to 1. \n
2546/// If there is at least one pair of double-precision elements where the
2547/// sign-bit of the first element is 0 and the sign-bit of the second element
2548/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2549/// This intrinsic returns the value of the CF flag.
2550///
2551/// \headerfile <x86intrin.h>
2552///
2553/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2554///
2555/// \param __a
2556/// A 128-bit vector of [2 x double].
2557/// \param __b
2558/// A 128-bit vector of [2 x double].
2559/// \returns the CF flag in the EFLAGS register.
2561 __m128d __b) {
2562 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2563}
2564
2565/// Given two 128-bit floating-point vectors of [2 x double], perform an
2566/// element-by-element comparison of the double-precision element in the
2567/// first source vector and the corresponding element in the second source
2568/// vector.
2569///
2570/// The EFLAGS register is updated as follows: \n
2571/// If there is at least one pair of double-precision elements where the
2572/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2573/// ZF flag is set to 1. \n
2574/// If there is at least one pair of double-precision elements where the
2575/// sign-bit of the first element is 0 and the sign-bit of the second element
2576/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2577/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2578/// otherwise it returns 0.
2579///
2580/// \headerfile <x86intrin.h>
2581///
2582/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2583///
2584/// \param __a
2585/// A 128-bit vector of [2 x double].
2586/// \param __b
2587/// A 128-bit vector of [2 x double].
2588/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2589static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR
2590_mm_testnzc_pd(__m128d __a, __m128d __b) {
2591 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2592}
2593
2594/// Given two 128-bit floating-point vectors of [4 x float], perform an
2595/// element-by-element comparison of the single-precision element in the
2596/// first source vector and the corresponding element in the second source
2597/// vector.
2598///
2599/// The EFLAGS register is updated as follows: \n
2600/// If there is at least one pair of single-precision elements where the
2601/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2602/// ZF flag is set to 1. \n
2603/// If there is at least one pair of single-precision elements where the
2604/// sign-bit of the first element is 0 and the sign-bit of the second element
2605/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2606/// This intrinsic returns the value of the ZF flag.
2607///
2608/// \headerfile <x86intrin.h>
2609///
2610/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2611///
2612/// \param __a
2613/// A 128-bit vector of [4 x float].
2614/// \param __b
2615/// A 128-bit vector of [4 x float].
2616/// \returns the ZF flag.
2618 __m128 __b) {
2619 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2620}
2621
2622/// Given two 128-bit floating-point vectors of [4 x float], perform an
2623/// element-by-element comparison of the single-precision element in the
2624/// first source vector and the corresponding element in the second source
2625/// vector.
2626///
2627/// The EFLAGS register is updated as follows: \n
2628/// If there is at least one pair of single-precision elements where the
2629/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2630/// ZF flag is set to 1. \n
2631/// If there is at least one pair of single-precision elements where the
2632/// sign-bit of the first element is 0 and the sign-bit of the second element
2633/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2634/// This intrinsic returns the value of the CF flag.
2635///
2636/// \headerfile <x86intrin.h>
2637///
2638/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2639///
2640/// \param __a
2641/// A 128-bit vector of [4 x float].
2642/// \param __b
2643/// A 128-bit vector of [4 x float].
2644/// \returns the CF flag.
2646 __m128 __b) {
2647 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2648}
2649
2650/// Given two 128-bit floating-point vectors of [4 x float], perform an
2651/// element-by-element comparison of the single-precision element in the
2652/// first source vector and the corresponding element in the second source
2653/// vector.
2654///
2655/// The EFLAGS register is updated as follows: \n
2656/// If there is at least one pair of single-precision elements where the
2657/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2658/// ZF flag is set to 1. \n
2659/// If there is at least one pair of single-precision elements where the
2660/// sign-bit of the first element is 0 and the sign-bit of the second element
2661/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2662/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2663/// otherwise it returns 0.
2664///
2665/// \headerfile <x86intrin.h>
2666///
2667/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2668///
2669/// \param __a
2670/// A 128-bit vector of [4 x float].
2671/// \param __b
2672/// A 128-bit vector of [4 x float].
2673/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2675 __m128 __b) {
2676 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2677}
2678
2679/// Given two 256-bit floating-point vectors of [4 x double], perform an
2680/// element-by-element comparison of the double-precision elements in the
2681/// first source vector and the corresponding elements in the second source
2682/// vector.
2683///
2684/// The EFLAGS register is updated as follows: \n
2685/// If there is at least one pair of double-precision elements where the
2686/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2687/// ZF flag is set to 1. \n
2688/// If there is at least one pair of double-precision elements where the
2689/// sign-bit of the first element is 0 and the sign-bit of the second element
2690/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2691/// This intrinsic returns the value of the ZF flag.
2692///
2693/// \headerfile <x86intrin.h>
2694///
2695/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2696///
2697/// \param __a
2698/// A 256-bit vector of [4 x double].
2699/// \param __b
2700/// A 256-bit vector of [4 x double].
2701/// \returns the ZF flag.
2703 __m256d __b) {
2704 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2705}
2706
2707/// Given two 256-bit floating-point vectors of [4 x double], perform an
2708/// element-by-element comparison of the double-precision elements in the
2709/// first source vector and the corresponding elements in the second source
2710/// vector.
2711///
2712/// The EFLAGS register is updated as follows: \n
2713/// If there is at least one pair of double-precision elements where the
2714/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2715/// ZF flag is set to 1. \n
2716/// If there is at least one pair of double-precision elements where the
2717/// sign-bit of the first element is 0 and the sign-bit of the second element
2718/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2719/// This intrinsic returns the value of the CF flag.
2720///
2721/// \headerfile <x86intrin.h>
2722///
2723/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2724///
2725/// \param __a
2726/// A 256-bit vector of [4 x double].
2727/// \param __b
2728/// A 256-bit vector of [4 x double].
2729/// \returns the CF flag.
2731 __m256d __b) {
2732 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2733}
2734
2735/// Given two 256-bit floating-point vectors of [4 x double], perform an
2736/// element-by-element comparison of the double-precision elements in the
2737/// first source vector and the corresponding elements in the second source
2738/// vector.
2739///
2740/// The EFLAGS register is updated as follows: \n
2741/// If there is at least one pair of double-precision elements where the
2742/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2743/// ZF flag is set to 1. \n
2744/// If there is at least one pair of double-precision elements where the
2745/// sign-bit of the first element is 0 and the sign-bit of the second element
2746/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2747/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2748/// otherwise it returns 0.
2749///
2750/// \headerfile <x86intrin.h>
2751///
2752/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2753///
2754/// \param __a
2755/// A 256-bit vector of [4 x double].
2756/// \param __b
2757/// A 256-bit vector of [4 x double].
2758/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2759static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2760_mm256_testnzc_pd(__m256d __a, __m256d __b) {
2761 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2762}
2763
2764/// Given two 256-bit floating-point vectors of [8 x float], perform an
2765/// element-by-element comparison of the single-precision element in the
2766/// first source vector and the corresponding element in the second source
2767/// vector.
2768///
2769/// The EFLAGS register is updated as follows: \n
2770/// If there is at least one pair of single-precision elements where the
2771/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2772/// ZF flag is set to 1. \n
2773/// If there is at least one pair of single-precision elements where the
2774/// sign-bit of the first element is 0 and the sign-bit of the second element
2775/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2776/// This intrinsic returns the value of the ZF flag.
2777///
2778/// \headerfile <x86intrin.h>
2779///
2780/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2781///
2782/// \param __a
2783/// A 256-bit vector of [8 x float].
2784/// \param __b
2785/// A 256-bit vector of [8 x float].
2786/// \returns the ZF flag.
2788 __m256 __b) {
2789 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2790}
2791
2792/// Given two 256-bit floating-point vectors of [8 x float], perform an
2793/// element-by-element comparison of the single-precision element in the
2794/// first source vector and the corresponding element in the second source
2795/// vector.
2796///
2797/// The EFLAGS register is updated as follows: \n
2798/// If there is at least one pair of single-precision elements where the
2799/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2800/// ZF flag is set to 1. \n
2801/// If there is at least one pair of single-precision elements where the
2802/// sign-bit of the first element is 0 and the sign-bit of the second element
2803/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2804/// This intrinsic returns the value of the CF flag.
2805///
2806/// \headerfile <x86intrin.h>
2807///
2808/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2809///
2810/// \param __a
2811/// A 256-bit vector of [8 x float].
2812/// \param __b
2813/// A 256-bit vector of [8 x float].
2814/// \returns the CF flag.
2816 __m256 __b) {
2817 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2818}
2819
2820/// Given two 256-bit floating-point vectors of [8 x float], perform an
2821/// element-by-element comparison of the single-precision elements in the
2822/// first source vector and the corresponding elements in the second source
2823/// vector.
2824///
2825/// The EFLAGS register is updated as follows: \n
2826/// If there is at least one pair of single-precision elements where the
2827/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2828/// ZF flag is set to 1. \n
2829/// If there is at least one pair of single-precision elements where the
2830/// sign-bit of the first element is 0 and the sign-bit of the second element
2831/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2832/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2833/// otherwise it returns 0.
2834///
2835/// \headerfile <x86intrin.h>
2836///
2837/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2838///
2839/// \param __a
2840/// A 256-bit vector of [8 x float].
2841/// \param __b
2842/// A 256-bit vector of [8 x float].
2843/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2845 __m256 __b) {
2846 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2847}
2848
2849/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2850/// of the two source vectors.
2851///
2852/// The EFLAGS register is updated as follows: \n
2853/// If there is at least one pair of bits where both bits are 1, the ZF flag
2854/// is set to 0. Otherwise the ZF flag is set to 1. \n
2855/// If there is at least one pair of bits where the bit from the first source
2856/// vector is 0 and the bit from the second source vector is 1, the CF flag
2857/// is set to 0. Otherwise the CF flag is set to 1. \n
2858/// This intrinsic returns the value of the ZF flag.
2859///
2860/// \headerfile <x86intrin.h>
2861///
2862/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2863///
2864/// \param __a
2865/// A 256-bit integer vector.
2866/// \param __b
2867/// A 256-bit integer vector.
2868/// \returns the ZF flag.
2869static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2870_mm256_testz_si256(__m256i __a, __m256i __b) {
2871 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2872}
2873
2874/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2875/// of the two source vectors.
2876///
2877/// The EFLAGS register is updated as follows: \n
2878/// If there is at least one pair of bits where both bits are 1, the ZF flag
2879/// is set to 0. Otherwise the ZF flag is set to 1. \n
2880/// If there is at least one pair of bits where the bit from the first source
2881/// vector is 0 and the bit from the second source vector is 1, the CF flag
2882/// is set to 0. Otherwise the CF flag is set to 1. \n
2883/// This intrinsic returns the value of the CF flag.
2884///
2885/// \headerfile <x86intrin.h>
2886///
2887/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2888///
2889/// \param __a
2890/// A 256-bit integer vector.
2891/// \param __b
2892/// A 256-bit integer vector.
2893/// \returns the CF flag.
2894static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2895_mm256_testc_si256(__m256i __a, __m256i __b) {
2896 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2897}
2898
2899/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2900/// of the two source vectors.
2901///
2902/// The EFLAGS register is updated as follows: \n
2903/// If there is at least one pair of bits where both bits are 1, the ZF flag
2904/// is set to 0. Otherwise the ZF flag is set to 1. \n
2905/// If there is at least one pair of bits where the bit from the first source
2906/// vector is 0 and the bit from the second source vector is 1, the CF flag
2907/// is set to 0. Otherwise the CF flag is set to 1. \n
2908/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2909/// otherwise it returns 0.
2910///
2911/// \headerfile <x86intrin.h>
2912///
2913/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2914///
2915/// \param __a
2916/// A 256-bit integer vector.
2917/// \param __b
2918/// A 256-bit integer vector.
2919/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2920static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2921_mm256_testnzc_si256(__m256i __a, __m256i __b) {
2922 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2923}
2924
/* Vector extract sign mask */
2926/// Extracts the sign bits of double-precision floating point elements
2927/// in a 256-bit vector of [4 x double] and writes them to the lower order
2928/// bits of the return value.
2929///
2930/// \headerfile <x86intrin.h>
2931///
2932/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2933///
2934/// \param __a
2935/// A 256-bit vector of [4 x double] containing the double-precision
2936/// floating point values with sign bits to be extracted.
2937/// \returns The sign bits from the operand, written to bits [3:0].
2938static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2940 return __builtin_ia32_movmskpd256((__v4df)__a);
2941}
2942
2943/// Extracts the sign bits of single-precision floating point elements
2944/// in a 256-bit vector of [8 x float] and writes them to the lower order
2945/// bits of the return value.
2946///
2947/// \headerfile <x86intrin.h>
2948///
2949/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2950///
2951/// \param __a
2952/// A 256-bit vector of [8 x float] containing the single-precision floating
2953/// point values with sign bits to be extracted.
2954/// \returns The sign bits from the operand, written to bits [7:0].
2955static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2957 return __builtin_ia32_movmskps256((__v8sf)__a);
2958}
2959
/* Vector zero */
/// Clears every XMM or YMM register to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
    __attribute__((__always_inline__, __nodebug__, __target__("avx")))
    _mm256_zeroall(void) {
  __builtin_ia32_vzeroall();
}
2971
/// Clears the upper 128 bits (bits 255:128) of every YMM register to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
    __attribute__((__always_inline__, __nodebug__, __target__("avx")))
    _mm256_zeroupper(void) {
  __builtin_ia32_vzeroupper();
}
2982
/* Vector load with broadcast */
2984/// Loads a scalar single-precision floating point value from the
2985/// specified address pointed to by \a __a and broadcasts it to the elements
2986/// of a [4 x float] vector.
2987///
2988/// \headerfile <x86intrin.h>
2989///
2990/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2991///
2992/// \param __a
2993/// The single-precision floating point value to be broadcast.
2994/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2995/// equal to the broadcast value.
2996static __inline __m128 __DEFAULT_FN_ATTRS128
2998{
2999 struct __mm_broadcast_ss_struct {
3000 float __f;
3001 } __attribute__((__packed__, __may_alias__));
3002 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3003 return __extension__ (__m128){ __f, __f, __f, __f };
3004}
3005
3006/// Loads a scalar double-precision floating point value from the
3007/// specified address pointed to by \a __a and broadcasts it to the elements
3008/// of a [4 x double] vector.
3009///
3010/// \headerfile <x86intrin.h>
3011///
3012/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3013///
3014/// \param __a
3015/// The double-precision floating point value to be broadcast.
3016/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3017/// equal to the broadcast value.
3018static __inline __m256d __DEFAULT_FN_ATTRS
3020{
3021 struct __mm256_broadcast_sd_struct {
3022 double __d;
3023 } __attribute__((__packed__, __may_alias__));
3024 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3025 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3026}
3027
3028/// Loads a scalar single-precision floating point value from the
3029/// specified address pointed to by \a __a and broadcasts it to the elements
3030/// of a [8 x float] vector.
3031///
3032/// \headerfile <x86intrin.h>
3033///
3034/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3035///
3036/// \param __a
3037/// The single-precision floating point value to be broadcast.
3038/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3039/// equal to the broadcast value.
3040static __inline __m256 __DEFAULT_FN_ATTRS
3042{
3043 struct __mm256_broadcast_ss_struct {
3044 float __f;
3045 } __attribute__((__packed__, __may_alias__));
3046 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3047 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3048}
3049
3050/// Loads the data from a 128-bit vector of [2 x double] from the
3051/// specified address pointed to by \a __a and broadcasts it to 128-bit
3052/// elements in a 256-bit vector of [4 x double].
3053///
3054/// \headerfile <x86intrin.h>
3055///
3056/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3057///
3058/// \param __a
3059/// The 128-bit vector of [2 x double] to be broadcast.
3060/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3061/// equal to the broadcast value.
3062static __inline __m256d __DEFAULT_FN_ATTRS
3064{
3065 __m128d __b = _mm_loadu_pd((const double *)__a);
3066 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3067 0, 1, 0, 1);
3068}
3069
3070/// Loads the data from a 128-bit vector of [4 x float] from the
3071/// specified address pointed to by \a __a and broadcasts it to 128-bit
3072/// elements in a 256-bit vector of [8 x float].
3073///
3074/// \headerfile <x86intrin.h>
3075///
3076/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3077///
3078/// \param __a
3079/// The 128-bit vector of [4 x float] to be broadcast.
3080/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3081/// equal to the broadcast value.
3082static __inline __m256 __DEFAULT_FN_ATTRS
3084{
3085 __m128 __b = _mm_loadu_ps((const float *)__a);
3086 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3087 0, 1, 2, 3, 0, 1, 2, 3);
3088}
3089
3090/* SIMD load ops */
3091/// Loads 4 double-precision floating point values from a 32-byte aligned
3092/// memory location pointed to by \a __p into a vector of [4 x double].
3093///
3094/// \headerfile <x86intrin.h>
3095///
3096/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3097///
3098/// \param __p
3099/// A 32-byte aligned pointer to a memory location containing
3100/// double-precision floating point values.
3101/// \returns A 256-bit vector of [4 x double] containing the moved values.
3102static __inline __m256d __DEFAULT_FN_ATTRS
3103_mm256_load_pd(double const *__p)
3104{
3105 return *(const __m256d *)__p;
3106}
3107
3108/// Loads 8 single-precision floating point values from a 32-byte aligned
3109/// memory location pointed to by \a __p into a vector of [8 x float].
3110///
3111/// \headerfile <x86intrin.h>
3112///
3113/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3114///
3115/// \param __p
3116/// A 32-byte aligned pointer to a memory location containing float values.
3117/// \returns A 256-bit vector of [8 x float] containing the moved values.
3118static __inline __m256 __DEFAULT_FN_ATTRS
3119_mm256_load_ps(float const *__p)
3120{
3121 return *(const __m256 *)__p;
3122}
3123
3124/// Loads 4 double-precision floating point values from an unaligned
3125/// memory location pointed to by \a __p into a vector of [4 x double].
3126///
3127/// \headerfile <x86intrin.h>
3128///
3129/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3130///
3131/// \param __p
3132/// A pointer to a memory location containing double-precision floating
3133/// point values.
3134/// \returns A 256-bit vector of [4 x double] containing the moved values.
3135static __inline __m256d __DEFAULT_FN_ATTRS
3136_mm256_loadu_pd(double const *__p)
3137{
3138 struct __loadu_pd {
3139 __m256d_u __v;
3140 } __attribute__((__packed__, __may_alias__));
3141 return ((const struct __loadu_pd*)__p)->__v;
3142}
3143
3144/// Loads 8 single-precision floating point values from an unaligned
3145/// memory location pointed to by \a __p into a vector of [8 x float].
3146///
3147/// \headerfile <x86intrin.h>
3148///
3149/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3150///
3151/// \param __p
3152/// A pointer to a memory location containing single-precision floating
3153/// point values.
3154/// \returns A 256-bit vector of [8 x float] containing the moved values.
3155static __inline __m256 __DEFAULT_FN_ATTRS
3157{
3158 struct __loadu_ps {
3159 __m256_u __v;
3160 } __attribute__((__packed__, __may_alias__));
3161 return ((const struct __loadu_ps*)__p)->__v;
3162}
3163
3164/// Loads 256 bits of integer data from a 32-byte aligned memory
3165/// location pointed to by \a __p into elements of a 256-bit integer vector.
3166///
3167/// \headerfile <x86intrin.h>
3168///
3169/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3170///
3171/// \param __p
3172/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3173/// values.
3174/// \returns A 256-bit integer vector containing the moved values.
3175static __inline __m256i __DEFAULT_FN_ATTRS
3176_mm256_load_si256(__m256i const *__p)
3177{
3178 return *__p;
3179}
3180
3181/// Loads 256 bits of integer data from an unaligned memory location
3182/// pointed to by \a __p into a 256-bit integer vector.
3183///
3184/// \headerfile <x86intrin.h>
3185///
3186/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3187///
3188/// \param __p
3189/// A pointer to a 256-bit integer vector containing integer values.
3190/// \returns A 256-bit integer vector containing the moved values.
3191static __inline __m256i __DEFAULT_FN_ATTRS
3192_mm256_loadu_si256(__m256i_u const *__p)
3193{
3194 struct __loadu_si256 {
3195 __m256i_u __v;
3196 } __attribute__((__packed__, __may_alias__));
3197 return ((const struct __loadu_si256*)__p)->__v;
3198}
3199
3200/// Loads 256 bits of integer data from an unaligned memory location
3201/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3202/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3203/// line boundary.
3204///
3205/// \headerfile <x86intrin.h>
3206///
3207/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3208///
3209/// \param __p
3210/// A pointer to a 256-bit integer vector containing integer values.
3211/// \returns A 256-bit integer vector containing the moved values.
3212static __inline __m256i __DEFAULT_FN_ATTRS
3213_mm256_lddqu_si256(__m256i_u const *__p)
3214{
3215 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3216}
3217
3218/* SIMD store ops */
3219/// Stores double-precision floating point values from a 256-bit vector
3220/// of [4 x double] to a 32-byte aligned memory location pointed to by
3221/// \a __p.
3222///
3223/// \headerfile <x86intrin.h>
3224///
3225/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3226///
3227/// \param __p
3228/// A 32-byte aligned pointer to a memory location that will receive the
3229/// double-precision floating point values.
3230/// \param __a
3231/// A 256-bit vector of [4 x double] containing the values to be moved.
3232static __inline void __DEFAULT_FN_ATTRS
3233_mm256_store_pd(double *__p, __m256d __a)
3234{
3235 *(__m256d *)__p = __a;
3236}
3237
3238/// Stores single-precision floating point values from a 256-bit vector
3239/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3240///
3241/// \headerfile <x86intrin.h>
3242///
3243/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3244///
3245/// \param __p
3246/// A 32-byte aligned pointer to a memory location that will receive the
3247/// float values.
3248/// \param __a
3249/// A 256-bit vector of [8 x float] containing the values to be moved.
3250static __inline void __DEFAULT_FN_ATTRS
3251_mm256_store_ps(float *__p, __m256 __a)
3252{
3253 *(__m256 *)__p = __a;
3254}
3255
3256/// Stores double-precision floating point values from a 256-bit vector
3257/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3258///
3259/// \headerfile <x86intrin.h>
3260///
3261/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3262///
3263/// \param __p
3264/// A pointer to a memory location that will receive the double-precision
3265/// floating point values.
3266/// \param __a
3267/// A 256-bit vector of [4 x double] containing the values to be moved.
3268static __inline void __DEFAULT_FN_ATTRS
3269_mm256_storeu_pd(double *__p, __m256d __a)
3270{
3271 struct __storeu_pd {
3272 __m256d_u __v;
3273 } __attribute__((__packed__, __may_alias__));
3274 ((struct __storeu_pd*)__p)->__v = __a;
3275}
3276
3277/// Stores single-precision floating point values from a 256-bit vector
3278/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3279///
3280/// \headerfile <x86intrin.h>
3281///
3282/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3283///
3284/// \param __p
3285/// A pointer to a memory location that will receive the float values.
3286/// \param __a
3287/// A 256-bit vector of [8 x float] containing the values to be moved.
3288static __inline void __DEFAULT_FN_ATTRS
3289_mm256_storeu_ps(float *__p, __m256 __a)
3290{
3291 struct __storeu_ps {
3292 __m256_u __v;
3293 } __attribute__((__packed__, __may_alias__));
3294 ((struct __storeu_ps*)__p)->__v = __a;
3295}
3296
3297/// Stores integer values from a 256-bit integer vector to a 32-byte
3298/// aligned memory location pointed to by \a __p.
3299///
3300/// \headerfile <x86intrin.h>
3301///
3302/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3303///
3304/// \param __p
3305/// A 32-byte aligned pointer to a memory location that will receive the
3306/// integer values.
3307/// \param __a
3308/// A 256-bit integer vector containing the values to be moved.
3309static __inline void __DEFAULT_FN_ATTRS
3310_mm256_store_si256(__m256i *__p, __m256i __a)
3311{
3312 *__p = __a;
3313}
3314
3315/// Stores integer values from a 256-bit integer vector to an unaligned
3316/// memory location pointed to by \a __p.
3317///
3318/// \headerfile <x86intrin.h>
3319///
3320/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3321///
3322/// \param __p
3323/// A pointer to a memory location that will receive the integer values.
3324/// \param __a
3325/// A 256-bit integer vector containing the values to be moved.
3326static __inline void __DEFAULT_FN_ATTRS
3327_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3328{
3329 struct __storeu_si256 {
3330 __m256i_u __v;
3331 } __attribute__((__packed__, __may_alias__));
3332 ((struct __storeu_si256*)__p)->__v = __a;
3333}
3334
3335/* Conditional load ops */
3336/// Conditionally loads double-precision floating point elements from a
3337/// memory location pointed to by \a __p into a 128-bit vector of
3338/// [2 x double], depending on the mask bits associated with each data
3339/// element.
3340///
3341/// \headerfile <x86intrin.h>
3342///
3343/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3344///
3345/// \param __p
3346/// A pointer to a memory location that contains the double-precision
3347/// floating point values.
3348/// \param __m
3349/// A 128-bit integer vector containing the mask. The most significant bit of
3350/// each data element represents the mask bits. If a mask bit is zero, the
3351/// corresponding value in the memory location is not loaded and the
3352/// corresponding field in the return value is set to zero.
3353/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3354static __inline __m128d __DEFAULT_FN_ATTRS128
3355_mm_maskload_pd(double const *__p, __m128i __m)
3356{
3357 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3358}
3359
3360/// Conditionally loads double-precision floating point elements from a
3361/// memory location pointed to by \a __p into a 256-bit vector of
3362/// [4 x double], depending on the mask bits associated with each data
3363/// element.
3364///
3365/// \headerfile <x86intrin.h>
3366///
3367/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3368///
3369/// \param __p
3370/// A pointer to a memory location that contains the double-precision
3371/// floating point values.
3372/// \param __m
3373/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3374/// significant bit of each quadword element represents the mask bits. If a
3375/// mask bit is zero, the corresponding value in the memory location is not
3376/// loaded and the corresponding field in the return value is set to zero.
3377/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3378static __inline __m256d __DEFAULT_FN_ATTRS
3379_mm256_maskload_pd(double const *__p, __m256i __m)
3380{
3381 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3382 (__v4di)__m);
3383}
3384
3385/// Conditionally loads single-precision floating point elements from a
3386/// memory location pointed to by \a __p into a 128-bit vector of
3387/// [4 x float], depending on the mask bits associated with each data
3388/// element.
3389///
3390/// \headerfile <x86intrin.h>
3391///
3392/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3393///
3394/// \param __p
3395/// A pointer to a memory location that contains the single-precision
3396/// floating point values.
3397/// \param __m
3398/// A 128-bit integer vector containing the mask. The most significant bit of
3399/// each data element represents the mask bits. If a mask bit is zero, the
3400/// corresponding value in the memory location is not loaded and the
3401/// corresponding field in the return value is set to zero.
3402/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3403static __inline __m128 __DEFAULT_FN_ATTRS128
3404_mm_maskload_ps(float const *__p, __m128i __m)
3405{
3406 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3407}
3408
3409/// Conditionally loads single-precision floating point elements from a
3410/// memory location pointed to by \a __p into a 256-bit vector of
3411/// [8 x float], depending on the mask bits associated with each data
3412/// element.
3413///
3414/// \headerfile <x86intrin.h>
3415///
3416/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3417///
3418/// \param __p
3419/// A pointer to a memory location that contains the single-precision
3420/// floating point values.
3421/// \param __m
3422/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3423/// significant bit of each dword element represents the mask bits. If a mask
3424/// bit is zero, the corresponding value in the memory location is not loaded
3425/// and the corresponding field in the return value is set to zero.
3426/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3427static __inline __m256 __DEFAULT_FN_ATTRS
3428_mm256_maskload_ps(float const *__p, __m256i __m)
3429{
3430 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3431}
3432
3433/* Conditional store ops */
3434/// Moves single-precision floating point values from a 256-bit vector
3435/// of [8 x float] to a memory location pointed to by \a __p, according to
3436/// the specified mask.
3437///
3438/// \headerfile <x86intrin.h>
3439///
3440/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3441///
3442/// \param __p
3443/// A pointer to a memory location that will receive the float values.
3444/// \param __m
3445/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3446/// significant bit of each dword element in the mask vector represents the
3447/// mask bits. If a mask bit is zero, the corresponding value from vector
3448/// \a __a is not stored and the corresponding field in the memory location
3449/// pointed to by \a __p is not changed.
3450/// \param __a
3451/// A 256-bit vector of [8 x float] containing the values to be stored.
3452static __inline void __DEFAULT_FN_ATTRS
3453_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3454{
3455 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3456}
3457
3458/// Moves double-precision values from a 128-bit vector of [2 x double]
3459/// to a memory location pointed to by \a __p, according to the specified
3460/// mask.
3461///
3462/// \headerfile <x86intrin.h>
3463///
3464/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3465///
3466/// \param __p
3467/// A pointer to a memory location that will receive the double-precision
/// values.
3468/// \param __m
3469/// A 128-bit integer vector containing the mask. The most significant bit of
3470/// each field in the mask vector represents the mask bits. If a mask bit is
3471/// zero, the corresponding value from vector \a __a is not stored and the
3472/// corresponding field in the memory location pointed to by \a __p is not
3473/// changed.
3474/// \param __a
3475/// A 128-bit vector of [2 x double] containing the values to be stored.
3476static __inline void __DEFAULT_FN_ATTRS128
3477_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3478{
3479 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3480}
3481
3482/// Moves double-precision values from a 256-bit vector of [4 x double]
3483/// to a memory location pointed to by \a __p, according to the specified
3484/// mask.
3485///
3486/// \headerfile <x86intrin.h>
3487///
3488/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3489///
3490/// \param __p
3491/// A pointer to a memory location that will receive the double-precision
/// values.
3492/// \param __m
3493/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3494/// significant bit of each quadword element in the mask vector represents
3495/// the mask bits. If a mask bit is zero, the corresponding value from vector
3496/// \a __a is not stored and the corresponding field in the memory location
3497/// pointed to by \a __p is not changed.
3498/// \param __a
3499/// A 256-bit vector of [4 x double] containing the values to be stored.
3500static __inline void __DEFAULT_FN_ATTRS
3501_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3502{
3503 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3504}
3505
3506/// Moves single-precision floating point values from a 128-bit vector
3507/// of [4 x float] to a memory location pointed to by \a __p, according to
3508/// the specified mask.
3509///
3510/// \headerfile <x86intrin.h>
3511///
3512/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3513///
3514/// \param __p
3515/// A pointer to a memory location that will receive the float values.
3516/// \param __m
3517/// A 128-bit integer vector containing the mask. The most significant bit of
3518/// each field in the mask vector represents the mask bits. If a mask bit is
3519/// zero, the corresponding value from vector \a __a is not stored and the
3520/// corresponding field in the memory location pointed to by \a __p is not
3521/// changed.
3522/// \param __a
3523/// A 128-bit vector of [4 x float] containing the values to be stored.
3524static __inline void __DEFAULT_FN_ATTRS128
3525_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3526{
3527 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3528}
3529
3530/* Cacheability support ops */
3531/// Moves integer data from a 256-bit integer vector to a 32-byte
3532/// aligned memory location. To minimize caching, the data is flagged as
3533/// non-temporal (unlikely to be used again soon).
3534///
3535/// \headerfile <x86intrin.h>
3536///
3537/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3538///
3539/// \param __a
3540/// A pointer to a 32-byte aligned memory location that will receive the
3541/// integer values.
3542/// \param __b
3543/// A 256-bit integer vector containing the values to be moved.
3544static __inline void __DEFAULT_FN_ATTRS
3546{
3547 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3548 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3549}
3550
3551/// Moves double-precision values from a 256-bit vector of [4 x double]
3552/// to a 32-byte aligned memory location. To minimize caching, the data is
3553/// flagged as non-temporal (unlikely to be used again soon).
3554///
3555/// \headerfile <x86intrin.h>
3556///
3557/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3558///
3559/// \param __a
3560/// A pointer to a 32-byte aligned memory location that will receive the
3561/// double-precision floating-point values.
3562/// \param __b
3563/// A 256-bit vector of [4 x double] containing the values to be moved.
3564static __inline void __DEFAULT_FN_ATTRS
3565_mm256_stream_pd(void *__a, __m256d __b)
3566{
3567 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3568 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3569}
3570
3571/// Moves single-precision floating point values from a 256-bit vector
3572/// of [8 x float] to a 32-byte aligned memory location. To minimize
3573/// caching, the data is flagged as non-temporal (unlikely to be used again
3574/// soon).
3575///
3576/// \headerfile <x86intrin.h>
3577///
3578/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3579///
3580/// \param __p
3581/// A pointer to a 32-byte aligned memory location that will receive the
3582/// single-precision floating point values.
3583/// \param __a
3584/// A 256-bit vector of [8 x float] containing the values to be moved.
3585static __inline void __DEFAULT_FN_ATTRS
3587{
3588 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3589 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3590}
3591
3592/* Create vectors */
3593/// Create a 256-bit vector of [4 x double] with undefined values.
3594///
3595/// \headerfile <x86intrin.h>
3596///
3597/// This intrinsic has no corresponding instruction.
3598///
3599/// \returns A 256-bit vector of [4 x double] containing undefined values.
3600static __inline__ __m256d __DEFAULT_FN_ATTRS
3602{
3603 return (__m256d)__builtin_ia32_undef256();
3604}
3605
3606/// Create a 256-bit vector of [8 x float] with undefined values.
3607///
3608/// \headerfile <x86intrin.h>
3609///
3610/// This intrinsic has no corresponding instruction.
3611///
3612/// \returns A 256-bit vector of [8 x float] containing undefined values.
3613static __inline__ __m256 __DEFAULT_FN_ATTRS
3615{
3616 return (__m256)__builtin_ia32_undef256();
3617}
3618
3619/// Create a 256-bit integer vector with undefined values.
3620///
3621/// \headerfile <x86intrin.h>
3622///
3623/// This intrinsic has no corresponding instruction.
3624///
3625/// \returns A 256-bit integer vector containing undefined values.
3626static __inline__ __m256i __DEFAULT_FN_ATTRS
3628{
3629 return (__m256i)__builtin_ia32_undef256();
3630}
3631
3632/// Constructs a 256-bit floating-point vector of [4 x double]
3633/// initialized with the specified double-precision floating-point values.
3634///
3635/// \headerfile <x86intrin.h>
3636///
3637/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3638/// instruction.
3639///
3640/// \param __a
3641/// A double-precision floating-point value used to initialize bits [255:192]
3642/// of the result.
3643/// \param __b
3644/// A double-precision floating-point value used to initialize bits [191:128]
3645/// of the result.
3646/// \param __c
3647/// A double-precision floating-point value used to initialize bits [127:64]
3648/// of the result.
3649/// \param __d
3650/// A double-precision floating-point value used to initialize bits [63:0]
3651/// of the result.
3652/// \returns An initialized 256-bit floating-point vector of [4 x double].
3653static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3654_mm256_set_pd(double __a, double __b, double __c, double __d)
3655{
3656 return __extension__ (__m256d){ __d, __c, __b, __a };
3657}
3658
3659/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3660/// with the specified single-precision floating-point values.
3661///
3662/// \headerfile <x86intrin.h>
3663///
3664/// This intrinsic is a utility function and does not correspond to a specific
3665/// instruction.
3666///
3667/// \param __a
3668/// A single-precision floating-point value used to initialize bits [255:224]
3669/// of the result.
3670/// \param __b
3671/// A single-precision floating-point value used to initialize bits [223:192]
3672/// of the result.
3673/// \param __c
3674/// A single-precision floating-point value used to initialize bits [191:160]
3675/// of the result.
3676/// \param __d
3677/// A single-precision floating-point value used to initialize bits [159:128]
3678/// of the result.
3679/// \param __e
3680/// A single-precision floating-point value used to initialize bits [127:96]
3681/// of the result.
3682/// \param __f
3683/// A single-precision floating-point value used to initialize bits [95:64]
3684/// of the result.
3685/// \param __g
3686/// A single-precision floating-point value used to initialize bits [63:32]
3687/// of the result.
3688/// \param __h
3689/// A single-precision floating-point value used to initialize bits [31:0]
3690/// of the result.
3691/// \returns An initialized 256-bit floating-point vector of [8 x float].
3692static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3693_mm256_set_ps(float __a, float __b, float __c, float __d,
3694 float __e, float __f, float __g, float __h)
3695{
3696 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3697}
3698
3699/// Constructs a 256-bit integer vector initialized with the specified
3700/// 32-bit integral values.
3701///
3702/// \headerfile <x86intrin.h>
3703///
3704/// This intrinsic is a utility function and does not correspond to a specific
3705/// instruction.
3706///
3707/// \param __i0
3708/// A 32-bit integral value used to initialize bits [255:224] of the result.
3709/// \param __i1
3710/// A 32-bit integral value used to initialize bits [223:192] of the result.
3711/// \param __i2
3712/// A 32-bit integral value used to initialize bits [191:160] of the result.
3713/// \param __i3
3714/// A 32-bit integral value used to initialize bits [159:128] of the result.
3715/// \param __i4
3716/// A 32-bit integral value used to initialize bits [127:96] of the result.
3717/// \param __i5
3718/// A 32-bit integral value used to initialize bits [95:64] of the result.
3719/// \param __i6
3720/// A 32-bit integral value used to initialize bits [63:32] of the result.
3721/// \param __i7
3722/// A 32-bit integral value used to initialize bits [31:0] of the result.
3723/// \returns An initialized 256-bit integer vector.
3724static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3725_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3726 int __i4, int __i5, int __i6, int __i7)
3727{
3728 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3729}
3730
3731/// Constructs a 256-bit integer vector initialized with the specified
3732/// 16-bit integral values.
3733///
3734/// \headerfile <x86intrin.h>
3735///
3736/// This intrinsic is a utility function and does not correspond to a specific
3737/// instruction.
3738///
3739/// \param __w15
3740/// A 16-bit integral value used to initialize bits [255:240] of the result.
3741/// \param __w14
3742/// A 16-bit integral value used to initialize bits [239:224] of the result.
3743/// \param __w13
3744/// A 16-bit integral value used to initialize bits [223:208] of the result.
3745/// \param __w12
3746/// A 16-bit integral value used to initialize bits [207:192] of the result.
3747/// \param __w11
3748/// A 16-bit integral value used to initialize bits [191:176] of the result.
3749/// \param __w10
3750/// A 16-bit integral value used to initialize bits [175:160] of the result.
3751/// \param __w09
3752/// A 16-bit integral value used to initialize bits [159:144] of the result.
3753/// \param __w08
3754/// A 16-bit integral value used to initialize bits [143:128] of the result.
3755/// \param __w07
3756/// A 16-bit integral value used to initialize bits [127:112] of the result.
3757/// \param __w06
3758/// A 16-bit integral value used to initialize bits [111:96] of the result.
3759/// \param __w05
3760/// A 16-bit integral value used to initialize bits [95:80] of the result.
3761/// \param __w04
3762/// A 16-bit integral value used to initialize bits [79:64] of the result.
3763/// \param __w03
3764/// A 16-bit integral value used to initialize bits [63:48] of the result.
3765/// \param __w02
3766/// A 16-bit integral value used to initialize bits [47:32] of the result.
3767/// \param __w01
3768/// A 16-bit integral value used to initialize bits [31:16] of the result.
3769/// \param __w00
3770/// A 16-bit integral value used to initialize bits [15:0] of the result.
3771/// \returns An initialized 256-bit integer vector.
3772static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3773_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3774 short __w11, short __w10, short __w09, short __w08,
3775 short __w07, short __w06, short __w05, short __w04,
3776 short __w03, short __w02, short __w01, short __w00)
3777{
3778 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3779 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3780}
3781
3782/// Constructs a 256-bit integer vector initialized with the specified
3783/// 8-bit integral values.
3784///
3785/// \headerfile <x86intrin.h>
3786///
3787/// This intrinsic is a utility function and does not correspond to a specific
3788/// instruction.
3789///
3790/// \param __b31
3791/// An 8-bit integral value used to initialize bits [255:248] of the result.
3792/// \param __b30
3793/// An 8-bit integral value used to initialize bits [247:240] of the result.
3794/// \param __b29
3795/// An 8-bit integral value used to initialize bits [239:232] of the result.
3796/// \param __b28
3797/// An 8-bit integral value used to initialize bits [231:224] of the result.
3798/// \param __b27
3799/// An 8-bit integral value used to initialize bits [223:216] of the result.
3800/// \param __b26
3801/// An 8-bit integral value used to initialize bits [215:208] of the result.
3802/// \param __b25
3803/// An 8-bit integral value used to initialize bits [207:200] of the result.
3804/// \param __b24
3805/// An 8-bit integral value used to initialize bits [199:192] of the result.
3806/// \param __b23
3807/// An 8-bit integral value used to initialize bits [191:184] of the result.
3808/// \param __b22
3809/// An 8-bit integral value used to initialize bits [183:176] of the result.
3810/// \param __b21
3811/// An 8-bit integral value used to initialize bits [175:168] of the result.
3812/// \param __b20
3813/// An 8-bit integral value used to initialize bits [167:160] of the result.
3814/// \param __b19
3815/// An 8-bit integral value used to initialize bits [159:152] of the result.
3816/// \param __b18
3817/// An 8-bit integral value used to initialize bits [151:144] of the result.
3818/// \param __b17
3819/// An 8-bit integral value used to initialize bits [143:136] of the result.
3820/// \param __b16
3821/// An 8-bit integral value used to initialize bits [135:128] of the result.
3822/// \param __b15
3823/// An 8-bit integral value used to initialize bits [127:120] of the result.
3824/// \param __b14
3825/// An 8-bit integral value used to initialize bits [119:112] of the result.
3826/// \param __b13
3827/// An 8-bit integral value used to initialize bits [111:104] of the result.
3828/// \param __b12
3829/// An 8-bit integral value used to initialize bits [103:96] of the result.
3830/// \param __b11
3831/// An 8-bit integral value used to initialize bits [95:88] of the result.
3832/// \param __b10
3833/// An 8-bit integral value used to initialize bits [87:80] of the result.
3834/// \param __b09
3835/// An 8-bit integral value used to initialize bits [79:72] of the result.
3836/// \param __b08
3837/// An 8-bit integral value used to initialize bits [71:64] of the result.
3838/// \param __b07
3839/// An 8-bit integral value used to initialize bits [63:56] of the result.
3840/// \param __b06
3841/// An 8-bit integral value used to initialize bits [55:48] of the result.
3842/// \param __b05
3843/// An 8-bit integral value used to initialize bits [47:40] of the result.
3844/// \param __b04
3845/// An 8-bit integral value used to initialize bits [39:32] of the result.
3846/// \param __b03
3847/// An 8-bit integral value used to initialize bits [31:24] of the result.
3848/// \param __b02
3849/// An 8-bit integral value used to initialize bits [23:16] of the result.
3850/// \param __b01
3851/// An 8-bit integral value used to initialize bits [15:8] of the result.
3852/// \param __b00
3853/// An 8-bit integral value used to initialize bits [7:0] of the result.
3854/// \returns An initialized 256-bit integer vector.
3855static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3856_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3857 char __b27, char __b26, char __b25, char __b24,
3858 char __b23, char __b22, char __b21, char __b20,
3859 char __b19, char __b18, char __b17, char __b16,
3860 char __b15, char __b14, char __b13, char __b12,
3861 char __b11, char __b10, char __b09, char __b08,
3862 char __b07, char __b06, char __b05, char __b04,
3863 char __b03, char __b02, char __b01, char __b00)
3864{
3865 return __extension__ (__m256i)(__v32qi){
3866 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3867 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3868 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3869 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3870 };
3871}
3872
3873/// Constructs a 256-bit integer vector initialized with the specified
3874/// 64-bit integral values.
3875///
3876/// \headerfile <x86intrin.h>
3877///
3878/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3879/// instruction.
3880///
3881/// \param __a
3882/// A 64-bit integral value used to initialize bits [255:192] of the result.
3883/// \param __b
3884/// A 64-bit integral value used to initialize bits [191:128] of the result.
3885/// \param __c
3886/// A 64-bit integral value used to initialize bits [127:64] of the result.
3887/// \param __d
3888/// A 64-bit integral value used to initialize bits [63:0] of the result.
3889/// \returns An initialized 256-bit integer vector.
3890static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3891_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3892{
3893 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3894}
3895
3896/* Create vectors with elements in reverse order */
3897/// Constructs a 256-bit floating-point vector of [4 x double],
3898/// initialized in reverse order with the specified double-precision
3899/// floating-point values.
3900///
3901/// \headerfile <x86intrin.h>
3902///
3903/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3904/// instruction.
3905///
3906/// \param __a
3907/// A double-precision floating-point value used to initialize bits [63:0]
3908/// of the result.
3909/// \param __b
3910/// A double-precision floating-point value used to initialize bits [127:64]
3911/// of the result.
3912/// \param __c
3913/// A double-precision floating-point value used to initialize bits [191:128]
3914/// of the result.
3915/// \param __d
3916/// A double-precision floating-point value used to initialize bits [255:192]
3917/// of the result.
3918/// \returns An initialized 256-bit floating-point vector of [4 x double].
3919static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3920_mm256_setr_pd(double __a, double __b, double __c, double __d)
3921{
3922 return _mm256_set_pd(__d, __c, __b, __a);
3923}
3924
3925/// Constructs a 256-bit floating-point vector of [8 x float],
3926/// initialized in reverse order with the specified single-precision
3927/// float-point values.
3928///
3929/// \headerfile <x86intrin.h>
3930///
3931/// This intrinsic is a utility function and does not correspond to a specific
3932/// instruction.
3933///
3934/// \param __a
3935/// A single-precision floating-point value used to initialize bits [31:0]
3936/// of the result.
3937/// \param __b
3938/// A single-precision floating-point value used to initialize bits [63:32]
3939/// of the result.
3940/// \param __c
3941/// A single-precision floating-point value used to initialize bits [95:64]
3942/// of the result.
3943/// \param __d
3944/// A single-precision floating-point value used to initialize bits [127:96]
3945/// of the result.
3946/// \param __e
3947/// A single-precision floating-point value used to initialize bits [159:128]
3948/// of the result.
3949/// \param __f
3950/// A single-precision floating-point value used to initialize bits [191:160]
3951/// of the result.
3952/// \param __g
3953/// A single-precision floating-point value used to initialize bits [223:192]
3954/// of the result.
3955/// \param __h
3956/// A single-precision floating-point value used to initialize bits [255:224]
3957/// of the result.
3958/// \returns An initialized 256-bit floating-point vector of [8 x float].
3959static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3960_mm256_setr_ps(float __a, float __b, float __c, float __d,
3961 float __e, float __f, float __g, float __h)
3962{
3963 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3964}
3965
3966/// Constructs a 256-bit integer vector, initialized in reverse order
3967/// with the specified 32-bit integral values.
3968///
3969/// \headerfile <x86intrin.h>
3970///
3971/// This intrinsic is a utility function and does not correspond to a specific
3972/// instruction.
3973///
3974/// \param __i0
3975/// A 32-bit integral value used to initialize bits [31:0] of the result.
3976/// \param __i1
3977/// A 32-bit integral value used to initialize bits [63:32] of the result.
3978/// \param __i2
3979/// A 32-bit integral value used to initialize bits [95:64] of the result.
3980/// \param __i3
3981/// A 32-bit integral value used to initialize bits [127:96] of the result.
3982/// \param __i4
3983/// A 32-bit integral value used to initialize bits [159:128] of the result.
3984/// \param __i5
3985/// A 32-bit integral value used to initialize bits [191:160] of the result.
3986/// \param __i6
3987/// A 32-bit integral value used to initialize bits [223:192] of the result.
3988/// \param __i7
3989/// A 32-bit integral value used to initialize bits [255:224] of the result.
3990/// \returns An initialized 256-bit integer vector.
3991static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3992_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3993 int __i4, int __i5, int __i6, int __i7)
3994{
3995 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
3996}
3997
3998/// Constructs a 256-bit integer vector, initialized in reverse order
3999/// with the specified 16-bit integral values.
4000///
4001/// \headerfile <x86intrin.h>
4002///
4003/// This intrinsic is a utility function and does not correspond to a specific
4004/// instruction.
4005///
4006/// \param __w15
4007/// A 16-bit integral value used to initialize bits [15:0] of the result.
4008/// \param __w14
4009/// A 16-bit integral value used to initialize bits [31:16] of the result.
4010/// \param __w13
4011/// A 16-bit integral value used to initialize bits [47:32] of the result.
4012/// \param __w12
4013/// A 16-bit integral value used to initialize bits [63:48] of the result.
4014/// \param __w11
4015/// A 16-bit integral value used to initialize bits [79:64] of the result.
4016/// \param __w10
4017/// A 16-bit integral value used to initialize bits [95:80] of the result.
4018/// \param __w09
4019/// A 16-bit integral value used to initialize bits [111:96] of the result.
4020/// \param __w08
4021/// A 16-bit integral value used to initialize bits [127:112] of the result.
4022/// \param __w07
4023/// A 16-bit integral value used to initialize bits [143:128] of the result.
4024/// \param __w06
4025/// A 16-bit integral value used to initialize bits [159:144] of the result.
4026/// \param __w05
4027/// A 16-bit integral value used to initialize bits [175:160] of the result.
4028/// \param __w04
4029/// A 16-bit integral value used to initialize bits [191:176] of the result.
4030/// \param __w03
4031/// A 16-bit integral value used to initialize bits [207:192] of the result.
4032/// \param __w02
4033/// A 16-bit integral value used to initialize bits [223:208] of the result.
4034/// \param __w01
4035/// A 16-bit integral value used to initialize bits [239:224] of the result.
4036/// \param __w00
4037/// A 16-bit integral value used to initialize bits [255:240] of the result.
4038/// \returns An initialized 256-bit integer vector.
4039static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4040_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4041 short __w11, short __w10, short __w09, short __w08,
4042 short __w07, short __w06, short __w05, short __w04,
4043 short __w03, short __w02, short __w01, short __w00)
4044{
4045 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4046 __w04, __w05, __w06, __w07,
4047 __w08, __w09, __w10, __w11,
4048 __w12, __w13, __w14, __w15);
4049}
4050
4051/// Constructs a 256-bit integer vector, initialized in reverse order
4052/// with the specified 8-bit integral values.
4053///
4054/// \headerfile <x86intrin.h>
4055///
4056/// This intrinsic is a utility function and does not correspond to a specific
4057/// instruction.
4058///
4059/// \param __b31
4060/// An 8-bit integral value used to initialize bits [7:0] of the result.
4061/// \param __b30
4062/// An 8-bit integral value used to initialize bits [15:8] of the result.
4063/// \param __b29
4064/// An 8-bit integral value used to initialize bits [23:16] of the result.
4065/// \param __b28
4066/// An 8-bit integral value used to initialize bits [31:24] of the result.
4067/// \param __b27
4068/// An 8-bit integral value used to initialize bits [39:32] of the result.
4069/// \param __b26
4070/// An 8-bit integral value used to initialize bits [47:40] of the result.
4071/// \param __b25
4072/// An 8-bit integral value used to initialize bits [55:48] of the result.
4073/// \param __b24
4074/// An 8-bit integral value used to initialize bits [63:56] of the result.
4075/// \param __b23
4076/// An 8-bit integral value used to initialize bits [71:64] of the result.
4077/// \param __b22
4078/// An 8-bit integral value used to initialize bits [79:72] of the result.
4079/// \param __b21
4080/// An 8-bit integral value used to initialize bits [87:80] of the result.
4081/// \param __b20
4082/// An 8-bit integral value used to initialize bits [95:88] of the result.
4083/// \param __b19
4084/// An 8-bit integral value used to initialize bits [103:96] of the result.
4085/// \param __b18
4086/// An 8-bit integral value used to initialize bits [111:104] of the result.
4087/// \param __b17
4088/// An 8-bit integral value used to initialize bits [119:112] of the result.
4089/// \param __b16
4090/// An 8-bit integral value used to initialize bits [127:120] of the result.
4091/// \param __b15
4092/// An 8-bit integral value used to initialize bits [135:128] of the result.
4093/// \param __b14
4094/// An 8-bit integral value used to initialize bits [143:136] of the result.
4095/// \param __b13
4096/// An 8-bit integral value used to initialize bits [151:144] of the result.
4097/// \param __b12
4098/// An 8-bit integral value used to initialize bits [159:152] of the result.
4099/// \param __b11
4100/// An 8-bit integral value used to initialize bits [167:160] of the result.
4101/// \param __b10
4102/// An 8-bit integral value used to initialize bits [175:168] of the result.
4103/// \param __b09
4104/// An 8-bit integral value used to initialize bits [183:176] of the result.
4105/// \param __b08
4106/// An 8-bit integral value used to initialize bits [191:184] of the result.
4107/// \param __b07
4108/// An 8-bit integral value used to initialize bits [199:192] of the result.
4109/// \param __b06
4110/// An 8-bit integral value used to initialize bits [207:200] of the result.
4111/// \param __b05
4112/// An 8-bit integral value used to initialize bits [215:208] of the result.
4113/// \param __b04
4114/// An 8-bit integral value used to initialize bits [223:216] of the result.
4115/// \param __b03
4116/// An 8-bit integral value used to initialize bits [231:224] of the result.
4117/// \param __b02
4118/// An 8-bit integral value used to initialize bits [239:232] of the result.
4119/// \param __b01
4120/// An 8-bit integral value used to initialize bits [247:240] of the result.
4121/// \param __b00
4122/// An 8-bit integral value used to initialize bits [255:248] of the result.
4123/// \returns An initialized 256-bit integer vector.
4124static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4125_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4126 char __b27, char __b26, char __b25, char __b24,
4127 char __b23, char __b22, char __b21, char __b20,
4128 char __b19, char __b18, char __b17, char __b16,
4129 char __b15, char __b14, char __b13, char __b12,
4130 char __b11, char __b10, char __b09, char __b08,
4131 char __b07, char __b06, char __b05, char __b04,
4132 char __b03, char __b02, char __b01, char __b00)
4133{
4134 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4135 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4136 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4137 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4138}
4139
4140/// Constructs a 256-bit integer vector, initialized in reverse order
4141/// with the specified 64-bit integral values.
4142///
4143/// \headerfile <x86intrin.h>
4144///
4145/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4146/// instruction.
4147///
4148/// \param __a
4149/// A 64-bit integral value used to initialize bits [63:0] of the result.
4150/// \param __b
4151/// A 64-bit integral value used to initialize bits [127:64] of the result.
4152/// \param __c
4153/// A 64-bit integral value used to initialize bits [191:128] of the result.
4154/// \param __d
4155/// A 64-bit integral value used to initialize bits [255:192] of the result.
4156/// \returns An initialized 256-bit integer vector.
4157static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4158_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4159{
4160 return _mm256_set_epi64x(__d, __c, __b, __a);
4161}
4162
4163/* Create vectors with repeated elements */
4164/// Constructs a 256-bit floating-point vector of [4 x double], with each
4165/// of the four double-precision floating-point vector elements set to the
4166/// specified double-precision floating-point value.
4167///
4168/// \headerfile <x86intrin.h>
4169///
4170/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4171///
4172/// \param __w
4173/// A double-precision floating-point value used to initialize each vector
4174/// element of the result.
4175/// \returns An initialized 256-bit floating-point vector of [4 x double].
4176static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4178{
4179 return _mm256_set_pd(__w, __w, __w, __w);
4180}
4181
4182/// Constructs a 256-bit floating-point vector of [8 x float], with each
4183/// of the eight single-precision floating-point vector elements set to the
4184/// specified single-precision floating-point value.
4185///
4186/// \headerfile <x86intrin.h>
4187///
4188/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4189/// instruction.
4190///
4191/// \param __w
4192/// A single-precision floating-point value used to initialize each vector
4193/// element of the result.
4194/// \returns An initialized 256-bit floating-point vector of [8 x float].
4195static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4197{
4198 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4199}
4200
4201/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4202/// 32-bit integral vector elements set to the specified 32-bit integral
4203/// value.
4204///
4205/// \headerfile <x86intrin.h>
4206///
4207/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4208/// instruction.
4209///
4210/// \param __i
4211/// A 32-bit integral value used to initialize each vector element of the
4212/// result.
4213/// \returns An initialized 256-bit integer vector of [8 x i32].
4214static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4216{
4217 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4218}
4219
4220/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4221/// 16-bit integral vector elements set to the specified 16-bit integral
4222/// value.
4223///
4224/// \headerfile <x86intrin.h>
4225///
4226/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4227///
4228/// \param __w
4229/// A 16-bit integral value used to initialize each vector element of the
4230/// result.
4231/// \returns An initialized 256-bit integer vector of [16 x i16].
4232static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4234{
4235 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4236 __w, __w, __w, __w, __w, __w, __w, __w);
4237}
4238
4239/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4240/// 8-bit integral vector elements set to the specified 8-bit integral value.
4241///
4242/// \headerfile <x86intrin.h>
4243///
4244/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4245///
4246/// \param __b
4247/// An 8-bit integral value used to initialize each vector element of the
4248/// result.
4249/// \returns An initialized 256-bit integer vector of [32 x i8].
4250static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4252{
4253 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4254 __b, __b, __b, __b, __b, __b, __b, __b,
4255 __b, __b, __b, __b, __b, __b, __b, __b,
4256 __b, __b, __b, __b, __b, __b, __b, __b);
4257}
4258
4259/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4260/// 64-bit integral vector elements set to the specified 64-bit integral
4261/// value.
4262///
4263/// \headerfile <x86intrin.h>
4264///
4265/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4266///
4267/// \param __q
4268/// A 64-bit integral value used to initialize each vector element of the
4269/// result.
4270/// \returns An initialized 256-bit integer vector of [4 x i64].
4271static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4273{
4274 return _mm256_set_epi64x(__q, __q, __q, __q);
4275}
4276
/* Create zeroed vectors */
4278/// Constructs a 256-bit floating-point vector of [4 x double] with all
4279/// vector elements initialized to zero.
4280///
4281/// \headerfile <x86intrin.h>
4282///
4283/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4284///
4285/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4287 return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
4288}
4289
4290/// Constructs a 256-bit floating-point vector of [8 x float] with all
4291/// vector elements initialized to zero.
4292///
4293/// \headerfile <x86intrin.h>
4294///
4295/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4296///
4297/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4299 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4300}
4301
4302/// Constructs a 256-bit integer vector initialized to zero.
4303///
4304/// \headerfile <x86intrin.h>
4305///
4306/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4307///
4308/// \returns A 256-bit integer vector initialized to zero.
4309static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4311 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4312}
4313
4314/* Cast between vector types */
4315/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4316/// floating-point vector of [8 x float].
4317///
4318/// \headerfile <x86intrin.h>
4319///
4320/// This intrinsic has no corresponding instruction.
4321///
4322/// \param __a
4323/// A 256-bit floating-point vector of [4 x double].
4324/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4325/// bitwise pattern as the parameter.
4326static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4328{
4329 return (__m256)__a;
4330}
4331
4332/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4333/// integer vector.
4334///
4335/// \headerfile <x86intrin.h>
4336///
4337/// This intrinsic has no corresponding instruction.
4338///
4339/// \param __a
4340/// A 256-bit floating-point vector of [4 x double].
4341/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4342/// parameter.
4343static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4345{
4346 return (__m256i)__a;
4347}
4348
4349/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4350/// floating-point vector of [4 x double].
4351///
4352/// \headerfile <x86intrin.h>
4353///
4354/// This intrinsic has no corresponding instruction.
4355///
4356/// \param __a
4357/// A 256-bit floating-point vector of [8 x float].
4358/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4359/// bitwise pattern as the parameter.
4360static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4362{
4363 return (__m256d)__a;
4364}
4365
4366/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4367/// integer vector.
4368///
4369/// \headerfile <x86intrin.h>
4370///
4371/// This intrinsic has no corresponding instruction.
4372///
4373/// \param __a
4374/// A 256-bit floating-point vector of [8 x float].
4375/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4376/// parameter.
4377static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4379{
4380 return (__m256i)__a;
4381}
4382
4383/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4384/// of [8 x float].
4385///
4386/// \headerfile <x86intrin.h>
4387///
4388/// This intrinsic has no corresponding instruction.
4389///
4390/// \param __a
4391/// A 256-bit integer vector.
4392/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4393/// bitwise pattern as the parameter.
4394static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4396{
4397 return (__m256)__a;
4398}
4399
4400/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4401/// of [4 x double].
4402///
4403/// \headerfile <x86intrin.h>
4404///
4405/// This intrinsic has no corresponding instruction.
4406///
4407/// \param __a
4408/// A 256-bit integer vector.
4409/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4410/// bitwise pattern as the parameter.
4411static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4413{
4414 return (__m256d)__a;
4415}
4416
4417/// Returns the lower 128 bits of a 256-bit floating-point vector of
4418/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4419///
4420/// \headerfile <x86intrin.h>
4421///
4422/// This intrinsic has no corresponding instruction.
4423///
4424/// \param __a
4425/// A 256-bit floating-point vector of [4 x double].
4426/// \returns A 128-bit floating-point vector of [2 x double] containing the
4427/// lower 128 bits of the parameter.
4428static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4430{
4431 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4432}
4433
4434/// Returns the lower 128 bits of a 256-bit floating-point vector of
4435/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4436///
4437/// \headerfile <x86intrin.h>
4438///
4439/// This intrinsic has no corresponding instruction.
4440///
4441/// \param __a
4442/// A 256-bit floating-point vector of [8 x float].
4443/// \returns A 128-bit floating-point vector of [4 x float] containing the
4444/// lower 128 bits of the parameter.
4445static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4447{
4448 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4449}
4450
4451/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4452///
4453/// \headerfile <x86intrin.h>
4454///
4455/// This intrinsic has no corresponding instruction.
4456///
4457/// \param __a
4458/// A 256-bit integer vector.
4459/// \returns A 128-bit integer vector containing the lower 128 bits of the
4460/// parameter.
4461static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4463{
4464 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4465}
4466
4467/// Constructs a 256-bit floating-point vector of [4 x double] from a
4468/// 128-bit floating-point vector of [2 x double].
4469///
4470/// The lower 128 bits contain the value of the source vector. The contents
4471/// of the upper 128 bits are undefined.
4472///
4473/// \headerfile <x86intrin.h>
4474///
4475/// This intrinsic has no corresponding instruction.
4476///
4477/// \param __a
4478/// A 128-bit vector of [2 x double].
4479/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4480/// contain the value of the parameter. The contents of the upper 128 bits
4481/// are undefined.
4482static __inline __m256d __DEFAULT_FN_ATTRS
4484{
4485 return __builtin_shufflevector(
4486 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4487}
4488
4489/// Constructs a 256-bit floating-point vector of [8 x float] from a
4490/// 128-bit floating-point vector of [4 x float].
4491///
4492/// The lower 128 bits contain the value of the source vector. The contents
4493/// of the upper 128 bits are undefined.
4494///
4495/// \headerfile <x86intrin.h>
4496///
4497/// This intrinsic has no corresponding instruction.
4498///
4499/// \param __a
4500/// A 128-bit vector of [4 x float].
4501/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4502/// contain the value of the parameter. The contents of the upper 128 bits
4503/// are undefined.
4504static __inline __m256 __DEFAULT_FN_ATTRS
4506{
4507 return __builtin_shufflevector((__v4sf)__a,
4508 (__v4sf)__builtin_nondeterministic_value(__a),
4509 0, 1, 2, 3, 4, 5, 6, 7);
4510}
4511
4512/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4513///
4514/// The lower 128 bits contain the value of the source vector. The contents
4515/// of the upper 128 bits are undefined.
4516///
4517/// \headerfile <x86intrin.h>
4518///
4519/// This intrinsic has no corresponding instruction.
4520///
4521/// \param __a
4522/// A 128-bit integer vector.
4523/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4524/// the parameter. The contents of the upper 128 bits are undefined.
4525static __inline __m256i __DEFAULT_FN_ATTRS
4527{
4528 return __builtin_shufflevector(
4529 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4530}
4531
4532/// Constructs a 256-bit floating-point vector of [4 x double] from a
4533/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4534/// contain the value of the source vector. The upper 128 bits are set
4535/// to zero.
4536///
4537/// \headerfile <x86intrin.h>
4538///
4539/// This intrinsic has no corresponding instruction.
4540///
4541/// \param __a
4542/// A 128-bit vector of [2 x double].
4543/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4544/// contain the value of the parameter. The upper 128 bits are set to zero.
4545static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4547 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4548}
4549
4550/// Constructs a 256-bit floating-point vector of [8 x float] from a
4551/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4552/// the value of the source vector. The upper 128 bits are set to zero.
4553///
4554/// \headerfile <x86intrin.h>
4555///
4556/// This intrinsic has no corresponding instruction.
4557///
4558/// \param __a
4559/// A 128-bit vector of [4 x float].
4560/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4561/// contain the value of the parameter. The upper 128 bits are set to zero.
4562static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4564 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4565}
4566
4567/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4568/// The lower 128 bits contain the value of the source vector. The upper
4569/// 128 bits are set to zero.
4570///
4571/// \headerfile <x86intrin.h>
4572///
4573/// This intrinsic has no corresponding instruction.
4574///
4575/// \param __a
4576/// A 128-bit integer vector.
4577/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4578/// the parameter. The upper 128 bits are set to zero.
4579static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4581 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4582}
4583
4584/*
4585 Vector insert.
4586 We use macros rather than inlines because we only want to accept
4587 invocations where the immediate M is a constant expression.
4588*/
/// Produces a 256-bit vector of [8 x float] that is a copy of the first
///    parameter with one 128-bit half replaced by the 128-bit vector of
///    [4 x float] given in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It is copied to the result, after which
///    either the upper or the lower 128 bits of the result are overwritten
///    with the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. Its contents are written to either the
///    upper or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit selects the half that
///    is replaced: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M)                                        \
  ((__m256)__builtin_ia32_vinsertf128_ps256(                                   \
      (__v8sf)(__m256)(V1), (__v4sf)(__m128)(V2), (int)(M)))
4626
/// Produces a 256-bit vector of [4 x double] that is a copy of the first
///    parameter with one 128-bit half replaced by the 128-bit vector of
///    [2 x double] given in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It is copied to the result, after
///    which either the upper or the lower 128 bits of the result are
///    overwritten with the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. Its contents are written to either the
///    upper or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit selects the half that
///    is replaced: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M)                                        \
  ((__m256d)__builtin_ia32_vinsertf128_pd256(                                  \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))
4664
/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first,
///    and then either the upper or the lower 128 bits of the result are
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result, depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the
///    values from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
4702
/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
4731
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
4755
/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
4779
4780/// Constructs a 256-bit floating-point vector of [8 x float] by
4781/// concatenating two 128-bit floating-point vectors of [4 x float].
4782///
4783/// \headerfile <x86intrin.h>
4784///
4785/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4786///
4787/// \param __hi
4788/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4789/// 128 bits of the result.
4790/// \param __lo
4791/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4792/// 128 bits of the result.
4793/// \returns A 256-bit floating-point vector of [8 x float] containing the
4794/// concatenated result.
4795static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4796_mm256_set_m128(__m128 __hi, __m128 __lo) {
4797 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4798}
4799
4800/// Constructs a 256-bit floating-point vector of [4 x double] by
4801/// concatenating two 128-bit floating-point vectors of [2 x double].
4802///
4803/// \headerfile <x86intrin.h>
4804///
4805/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4806///
4807/// \param __hi
4808/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4809/// 128 bits of the result.
4810/// \param __lo
4811/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4812/// 128 bits of the result.
4813/// \returns A 256-bit floating-point vector of [4 x double] containing the
4814/// concatenated result.
4815static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4816_mm256_set_m128d(__m128d __hi, __m128d __lo) {
4817 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4818}
4819
4820/// Constructs a 256-bit integer vector by concatenating two 128-bit
4821/// integer vectors.
4822///
4823/// \headerfile <x86intrin.h>
4824///
4825/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4826///
4827/// \param __hi
4828/// A 128-bit integer vector to be copied to the upper 128 bits of the
4829/// result.
4830/// \param __lo
4831/// A 128-bit integer vector to be copied to the lower 128 bits of the
4832/// result.
4833/// \returns A 256-bit integer vector containing the concatenated result.
4834static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4835_mm256_set_m128i(__m128i __hi, __m128i __lo) {
4836 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4837}
4838
4839/// Constructs a 256-bit floating-point vector of [8 x float] by
4840/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4841/// similar to _mm256_set_m128, but the order of the input parameters is
4842/// swapped.
4843///
4844/// \headerfile <x86intrin.h>
4845///
4846/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4847///
4848/// \param __lo
4849/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4850/// 128 bits of the result.
4851/// \param __hi
4852/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4853/// 128 bits of the result.
4854/// \returns A 256-bit floating-point vector of [8 x float] containing the
4855/// concatenated result.
4856static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4857_mm256_setr_m128(__m128 __lo, __m128 __hi) {
4858 return _mm256_set_m128(__hi, __lo);
4859}
4860
4861/// Constructs a 256-bit floating-point vector of [4 x double] by
4862/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4863/// similar to _mm256_set_m128d, but the order of the input parameters is
4864/// swapped.
4865///
4866/// \headerfile <x86intrin.h>
4867///
4868/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4869///
4870/// \param __lo
4871/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4872/// 128 bits of the result.
4873/// \param __hi
4874/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4875/// 128 bits of the result.
4876/// \returns A 256-bit floating-point vector of [4 x double] containing the
4877/// concatenated result.
4878static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4879_mm256_setr_m128d(__m128d __lo, __m128d __hi) {
4880 return (__m256d)_mm256_set_m128d(__hi, __lo);
4881}
4882
4883/// Constructs a 256-bit integer vector by concatenating two 128-bit
4884/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4885/// the input parameters is swapped.
4886///
4887/// \headerfile <x86intrin.h>
4888///
4889/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4890///
4891/// \param __lo
4892/// A 128-bit integer vector to be copied to the lower 128 bits of the
4893/// result.
4894/// \param __hi
4895/// A 128-bit integer vector to be copied to the upper 128 bits of the
4896/// result.
4897/// \returns A 256-bit integer vector containing the concatenated result.
4898static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4899_mm256_setr_m128i(__m128i __lo, __m128i __hi) {
4900 return (__m256i)_mm256_set_m128i(__hi, __lo);
4901}
4902
/* SIMD load ops (unaligned) */
4904/// Loads two 128-bit floating-point vectors of [4 x float] from
4905/// unaligned memory locations and constructs a 256-bit floating-point vector
4906/// of [8 x float] by concatenating the two 128-bit vectors.
4907///
4908/// \headerfile <x86intrin.h>
4909///
4910/// This intrinsic corresponds to load instructions followed by the
4911/// <c> VINSERTF128 </c> instruction.
4912///
4913/// \param __addr_hi
4914/// A pointer to a 128-bit memory location containing 4 consecutive
4915/// single-precision floating-point values. These values are to be copied to
4916/// bits[255:128] of the result. The address of the memory location does not
4917/// have to be aligned.
4918/// \param __addr_lo
4919/// A pointer to a 128-bit memory location containing 4 consecutive
4920/// single-precision floating-point values. These values are to be copied to
4921/// bits[127:0] of the result. The address of the memory location does not
4922/// have to be aligned.
4923/// \returns A 256-bit floating-point vector of [8 x float] containing the
4924/// concatenated result.
4925static __inline __m256 __DEFAULT_FN_ATTRS
4926_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4927{
4928 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4929}
4930
4931/// Loads two 128-bit floating-point vectors of [2 x double] from
4932/// unaligned memory locations and constructs a 256-bit floating-point vector
4933/// of [4 x double] by concatenating the two 128-bit vectors.
4934///
4935/// \headerfile <x86intrin.h>
4936///
4937/// This intrinsic corresponds to load instructions followed by the
4938/// <c> VINSERTF128 </c> instruction.
4939///
4940/// \param __addr_hi
4941/// A pointer to a 128-bit memory location containing two consecutive
4942/// double-precision floating-point values. These values are to be copied to
4943/// bits[255:128] of the result. The address of the memory location does not
4944/// have to be aligned.
4945/// \param __addr_lo
4946/// A pointer to a 128-bit memory location containing two consecutive
4947/// double-precision floating-point values. These values are to be copied to
4948/// bits[127:0] of the result. The address of the memory location does not
4949/// have to be aligned.
4950/// \returns A 256-bit floating-point vector of [4 x double] containing the
4951/// concatenated result.
4952static __inline __m256d __DEFAULT_FN_ATTRS
4953_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4954{
4955 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4956}
4957
4958/// Loads two 128-bit integer vectors from unaligned memory locations and
4959/// constructs a 256-bit integer vector by concatenating the two 128-bit
4960/// vectors.
4961///
4962/// \headerfile <x86intrin.h>
4963///
4964/// This intrinsic corresponds to load instructions followed by the
4965/// <c> VINSERTF128 </c> instruction.
4966///
4967/// \param __addr_hi
4968/// A pointer to a 128-bit memory location containing a 128-bit integer
4969/// vector. This vector is to be copied to bits[255:128] of the result. The
4970/// address of the memory location does not have to be aligned.
4971/// \param __addr_lo
4972/// A pointer to a 128-bit memory location containing a 128-bit integer
4973/// vector. This vector is to be copied to bits[127:0] of the result. The
4974/// address of the memory location does not have to be aligned.
4975/// \returns A 256-bit integer vector containing the concatenated result.
4976static __inline __m256i __DEFAULT_FN_ATTRS
4977_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
4978{
4979 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
4980}
4981
/* SIMD store ops (unaligned) */
4983/// Stores the upper and lower 128 bits of a 256-bit floating-point
4984/// vector of [8 x float] into two different unaligned memory locations.
4985///
4986/// \headerfile <x86intrin.h>
4987///
4988/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4989/// store instructions.
4990///
4991/// \param __addr_hi
4992/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4993/// copied to this memory location. The address of this memory location does
4994/// not have to be aligned.
4995/// \param __addr_lo
4996/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4997/// copied to this memory location. The address of this memory location does
4998/// not have to be aligned.
4999/// \param __a
5000/// A 256-bit floating-point vector of [8 x float].
5001static __inline void __DEFAULT_FN_ATTRS
5002_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5003{
5004 __m128 __v128;
5005
5006 __v128 = _mm256_castps256_ps128(__a);
5007 _mm_storeu_ps(__addr_lo, __v128);
5008 __v128 = _mm256_extractf128_ps(__a, 1);
5009 _mm_storeu_ps(__addr_hi, __v128);
5010}
5011
5012/// Stores the upper and lower 128 bits of a 256-bit floating-point
5013/// vector of [4 x double] into two different unaligned memory locations.
5014///
5015/// \headerfile <x86intrin.h>
5016///
5017/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5018/// store instructions.
5019///
5020/// \param __addr_hi
5021/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5022/// copied to this memory location. The address of this memory location does
5023/// not have to be aligned.
5024/// \param __addr_lo
5025/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5026/// copied to this memory location. The address of this memory location does
5027/// not have to be aligned.
5028/// \param __a
5029/// A 256-bit floating-point vector of [4 x double].
5030static __inline void __DEFAULT_FN_ATTRS
5031_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5032{
5033 __m128d __v128;
5034
5035 __v128 = _mm256_castpd256_pd128(__a);
5036 _mm_storeu_pd(__addr_lo, __v128);
5037 __v128 = _mm256_extractf128_pd(__a, 1);
5038 _mm_storeu_pd(__addr_hi, __v128);
5039}
5040
5041/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5042/// two different unaligned memory locations.
5043///
5044/// \headerfile <x86intrin.h>
5045///
5046/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5047/// store instructions.
5048///
5049/// \param __addr_hi
5050/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5051/// copied to this memory location. The address of this memory location does
5052/// not have to be aligned.
5053/// \param __addr_lo
5054/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5055/// copied to this memory location. The address of this memory location does
5056/// not have to be aligned.
5057/// \param __a
5058/// A 256-bit integer vector.
5059static __inline void __DEFAULT_FN_ATTRS
5060_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5061{
5062 __m128i __v128;
5063
5064 __v128 = _mm256_castsi256_si128(__a);
5065 _mm_storeu_si128(__addr_lo, __v128);
5066 __v128 = _mm256_extractf128_si256(__a, 1);
5067 _mm_storeu_si128(__addr_hi, __v128);
5068}
5069
5070#undef __DEFAULT_FN_ATTRS
5071#undef __DEFAULT_FN_ATTRS_CONSTEXPR
5072#undef __DEFAULT_FN_ATTRS128
5073#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5074
5075#endif /* __AVXINTRIN_H */
__device__ _Float16
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3019
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x floa...
Definition avxintrin.h:169
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition avxintrin.h:3063
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition avxintrin.h:3269
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition avxintrin.h:829
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition avxintrin.h:3565
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition avxintrin.h:2939
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition avxintrin.h:3083
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4546
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition avxintrin.h:2269
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition avxintrin.h:3233
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition avxintrin.h:3289
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition avxintrin.h:2956
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition avxintrin.h:4926
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:354
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3379
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition avxintrin.h:579
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2590
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition avxintrin.h:3725
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4563
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2674
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition avxintrin.h:116
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:388
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition avxintrin.h:2194
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition avxintrin.h:3614
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition avxintrin.h:304
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition avxintrin.h:973
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition avxintrin.h:3960
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4857
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3404
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3355
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition avxintrin.h:4344
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition avxintrin.h:186
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4899
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition avxintrin.h:3327
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition avxintrin.h:4729
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition avxintrin.h:4777
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition avxintrin.h:3176
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition avxintrin.h:4395
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition avxintrin.h:4327
static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition avxintrin.h:2336
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition avxintrin.h:3654
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition avxintrin.h:2361
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4879
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition avxintrin.h:132
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition avxintrin.h:1396
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition avxintrin.h:3545
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition avxintrin.h:3601
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:761
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:371
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition avxintrin.h:3773
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition avxintrin.h:2179
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition avxintrin.h:600
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition avxintrin.h:3477
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4483
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition avxintrin.h:4177
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2504
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition avxintrin.h:2249
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition avxintrin.h:3627
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition avxintrin.h:2229
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition avxintrin.h:2213
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition avxintrin.h:4196
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2478
static __inline __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition avxintrin.h:791
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition avxintrin.h:286
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition avxintrin.h:3119
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition avxintrin.h:2165
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors. Returns 1 (the ZF flag) if the bitwise AND of __a and __b is all zeros, and 0 otherwise.
Definition avxintrin.h:2870
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2617
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition avxintrin.h:2320
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition avxintrin.h:4753
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers returned in a vector of [8 x i32].
Definition avxintrin.h:2289
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4505
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3041
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition avxintrin.h:244
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2702
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zero.
Definition avxintrin.h:4298
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition avxintrin.h:4215
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition avxintrin.h:3856
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2645
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition avxintrin.h:4953
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition avxintrin.h:82
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_si256(__m256i __a, __m256i __b)
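Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors. Returns 1 if both the ZF and CF flags are 0 (that is, neither __a AND __b nor (NOT __a) AND __b is all zeros), and 0 otherwise.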
Definition avxintrin.h:2921
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:672
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two diffe...
Definition avxintrin.h:5031
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition avxintrin.h:337
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2760
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2787
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:696
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition avxintrin.h:4158
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements set to the specified 64-bit integral value.
Definition avxintrin.h:4272
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition avxintrin.h:3501
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition avxintrin.h:3453
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition avxintrin.h:4125
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4412
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p into a vector of [4 x double].
Definition avxintrin.h:3136
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-preci...
Definition avxintrin.h:3693
static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-point vector of [2 x double].
Definition avxintrin.h:4429
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector. The lower 128 bits contain the value of the source vector; the upper 128 bits are set to zero.
Definition avxintrin.h:4580
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition avxintrin.h:98
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition avxintrin.h:4040
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition avxintrin.h:2386
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition avxintrin.h:2408
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory l...
Definition avxintrin.h:5060
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-point vector of [4 x float].
Definition avxintrin.h:4446
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:618
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition avxintrin.h:4378
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2815
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:717
static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition avxintrin.h:883
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:654
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:636
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x doub...
Definition avxintrin.h:151
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:740
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition avxintrin.h:1423
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition avxintrin.h:4977
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2532
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2844
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition avxintrin.h:4286
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition avxintrin.h:3891
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3428
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition avxintrin.h:320
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition avxintrin.h:3586
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition avxintrin.h:558
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3192
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to by __p.
Definition avxintrin.h:3310
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition avxintrin.h:265
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4796
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4361
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4310
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
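Constructs a 256-bit integer vector from a 128-bit integer vector. The lower 128 bits contain the value of the source vector; the contents of the upper 128 bits are undefined.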
Definition avxintrin.h:4526
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition avxintrin.h:202
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p into a vector of [8 x float].
Definition avxintrin.h:3156
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:2997
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition avxintrin.h:3992
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2560
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition avxintrin.h:2452
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition avxintrin.h:2431
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition avxintrin.h:4233
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set to the specified 8-bit integral value.
Definition avxintrin.h:4251
static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition avxintrin.h:4462
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition avxintrin.h:223
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2730
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3213
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors. Returns 1 (the CF flag) if the bitwise AND of (NOT __a) with __b is all zeros, and 0 otherwise.
Definition avxintrin.h:2895
static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition avxintrin.h:2305
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4835
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition avxintrin.h:3251
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the spe...
Definition avxintrin.h:3920
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two differ...
Definition avxintrin.h:5002
typedef double __v4df __attribute__((__vector_size__(32)))
Definition avxintrin.h:17
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition avxintrin.h:3525
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4816
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [4 x double].
Definition avxintrin.h:3103
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition avxintrin.h:540
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition emmintrin.h:1619
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878
static __inline__ void _mm_stream_si32(void *__p, int __a)
Definition emmintrin.h:4077
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1867
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition emmintrin.h:3456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition emmintrin.h:1980
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition emmintrin.h:3909
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2100
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2021
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1863