/* clang 22.0.0git: avxintrin.h */
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* 256-bit vectors with an explicit element type. The intrinsics in this file
 * cast their operands to these types before operating on them. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 256-bit vector types with 32-byte alignment. */
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* The same vector types with the alignment requirement relaxed to 1 byte. */
typedef float __m256_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));

42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Define the default attributes for the functions in this file.
 * Both sets force inlining, suppress debug info and enable the "avx" target
 * feature; they differ only in the minimum vector width they declare
 * (256 bits vs. 128 bits). */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(128)))

/* The _CONSTEXPR variants additionally mark the function constexpr when the
 * header is compiled as C++11 or later; otherwise they are plain aliases. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

68/* Arithmetic */
69/// Adds two 256-bit vectors of [4 x double].
70///
71/// \headerfile <x86intrin.h>
72///
73/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
74///
75/// \param __a
76/// A 256-bit vector of [4 x double] containing one of the source operands.
77/// \param __b
78/// A 256-bit vector of [4 x double] containing one of the source operands.
79/// \returns A 256-bit vector of [4 x double] containing the sums of both
80/// operands.
81static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
82_mm256_add_pd(__m256d __a, __m256d __b) {
83 return (__m256d)((__v4df)__a+(__v4df)__b);
84}
85
86/// Adds two 256-bit vectors of [8 x float].
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
91///
92/// \param __a
93/// A 256-bit vector of [8 x float] containing one of the source operands.
94/// \param __b
95/// A 256-bit vector of [8 x float] containing one of the source operands.
96/// \returns A 256-bit vector of [8 x float] containing the sums of both
97/// operands.
98static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a,
99 __m256 __b) {
100 return (__m256)((__v8sf)__a+(__v8sf)__b);
101}
102
103/// Subtracts two 256-bit vectors of [4 x double].
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
108///
109/// \param __a
110/// A 256-bit vector of [4 x double] containing the minuend.
111/// \param __b
112/// A 256-bit vector of [4 x double] containing the subtrahend.
113/// \returns A 256-bit vector of [4 x double] containing the differences between
114/// both operands.
115static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
116_mm256_sub_pd(__m256d __a, __m256d __b) {
117 return (__m256d)((__v4df)__a-(__v4df)__b);
118}
119
120/// Subtracts two 256-bit vectors of [8 x float].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
125///
126/// \param __a
127/// A 256-bit vector of [8 x float] containing the minuend.
128/// \param __b
129/// A 256-bit vector of [8 x float] containing the subtrahend.
130/// \returns A 256-bit vector of [8 x float] containing the differences between
131/// both operands.
132static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a,
133 __m256 __b) {
134 return (__m256)((__v8sf)__a-(__v8sf)__b);
135}
136
137/// Adds the even-indexed values and subtracts the odd-indexed values of
138/// two 256-bit vectors of [4 x double].
139///
140/// \headerfile <x86intrin.h>
141///
142/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
143///
144/// \param __a
145/// A 256-bit vector of [4 x double] containing the left source operand.
146/// \param __b
147/// A 256-bit vector of [4 x double] containing the right source operand.
148/// \returns A 256-bit vector of [4 x double] containing the alternating sums
149/// and differences between both operands.
150static __inline __m256d __DEFAULT_FN_ATTRS
151_mm256_addsub_pd(__m256d __a, __m256d __b)
152{
153 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
154}
155
156/// Adds the even-indexed values and subtracts the odd-indexed values of
157/// two 256-bit vectors of [8 x float].
158///
159/// \headerfile <x86intrin.h>
160///
161/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
162///
163/// \param __a
164/// A 256-bit vector of [8 x float] containing the left source operand.
165/// \param __b
166/// A 256-bit vector of [8 x float] containing the right source operand.
167/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
168/// differences between both operands.
169static __inline __m256 __DEFAULT_FN_ATTRS
170_mm256_addsub_ps(__m256 __a, __m256 __b)
171{
172 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
173}
174
175/// Divides two 256-bit vectors of [4 x double].
176///
177/// \headerfile <x86intrin.h>
178///
179/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
180///
181/// \param __a
182/// A 256-bit vector of [4 x double] containing the dividend.
183/// \param __b
184/// A 256-bit vector of [4 x double] containing the divisor.
185/// \returns A 256-bit vector of [4 x double] containing the quotients of both
186/// operands.
187static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
188_mm256_div_pd(__m256d __a, __m256d __b) {
189 return (__m256d)((__v4df)__a/(__v4df)__b);
190}
191
192/// Divides two 256-bit vectors of [8 x float].
193///
194/// \headerfile <x86intrin.h>
195///
196/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
197///
198/// \param __a
199/// A 256-bit vector of [8 x float] containing the dividend.
200/// \param __b
201/// A 256-bit vector of [8 x float] containing the divisor.
202/// \returns A 256-bit vector of [8 x float] containing the quotients of both
203/// operands.
204static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a,
205 __m256 __b) {
206 return (__m256)((__v8sf)__a/(__v8sf)__b);
207}
208
209/// Compares two 256-bit vectors of [4 x double] and returns the greater
210/// of each pair of values.
211///
212/// If either value in a comparison is NaN, returns the value from \a __b.
213///
214/// \headerfile <x86intrin.h>
215///
216/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
217///
218/// \param __a
219/// A 256-bit vector of [4 x double] containing one of the operands.
220/// \param __b
221/// A 256-bit vector of [4 x double] containing one of the operands.
222/// \returns A 256-bit vector of [4 x double] containing the maximum values
223/// between both operands.
224static __inline __m256d __DEFAULT_FN_ATTRS
225_mm256_max_pd(__m256d __a, __m256d __b)
226{
227 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
228}
229
230/// Compares two 256-bit vectors of [8 x float] and returns the greater
231/// of each pair of values.
232///
233/// If either value in a comparison is NaN, returns the value from \a __b.
234///
235/// \headerfile <x86intrin.h>
236///
237/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
238///
239/// \param __a
240/// A 256-bit vector of [8 x float] containing one of the operands.
241/// \param __b
242/// A 256-bit vector of [8 x float] containing one of the operands.
243/// \returns A 256-bit vector of [8 x float] containing the maximum values
244/// between both operands.
245static __inline __m256 __DEFAULT_FN_ATTRS
246_mm256_max_ps(__m256 __a, __m256 __b)
247{
248 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
249}
250
251/// Compares two 256-bit vectors of [4 x double] and returns the lesser
252/// of each pair of values.
253///
254/// If either value in a comparison is NaN, returns the value from \a __b.
255///
256/// \headerfile <x86intrin.h>
257///
258/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
259///
260/// \param __a
261/// A 256-bit vector of [4 x double] containing one of the operands.
262/// \param __b
263/// A 256-bit vector of [4 x double] containing one of the operands.
264/// \returns A 256-bit vector of [4 x double] containing the minimum values
265/// between both operands.
266static __inline __m256d __DEFAULT_FN_ATTRS
267_mm256_min_pd(__m256d __a, __m256d __b)
268{
269 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
270}
271
272/// Compares two 256-bit vectors of [8 x float] and returns the lesser
273/// of each pair of values.
274///
275/// If either value in a comparison is NaN, returns the value from \a __b.
276///
277/// \headerfile <x86intrin.h>
278///
279/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
280///
281/// \param __a
282/// A 256-bit vector of [8 x float] containing one of the operands.
283/// \param __b
284/// A 256-bit vector of [8 x float] containing one of the operands.
285/// \returns A 256-bit vector of [8 x float] containing the minimum values
286/// between both operands.
287static __inline __m256 __DEFAULT_FN_ATTRS
288_mm256_min_ps(__m256 __a, __m256 __b)
289{
290 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
291}
292
293/// Multiplies two 256-bit vectors of [4 x double].
294///
295/// \headerfile <x86intrin.h>
296///
297/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
298///
299/// \param __a
300/// A 256-bit vector of [4 x double] containing one of the operands.
301/// \param __b
302/// A 256-bit vector of [4 x double] containing one of the operands.
303/// \returns A 256-bit vector of [4 x double] containing the products of both
304/// operands.
305static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
306_mm256_mul_pd(__m256d __a, __m256d __b) {
307 return (__m256d)((__v4df)__a * (__v4df)__b);
308}
309
310/// Multiplies two 256-bit vectors of [8 x float].
311///
312/// \headerfile <x86intrin.h>
313///
314/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
315///
316/// \param __a
317/// A 256-bit vector of [8 x float] containing one of the operands.
318/// \param __b
319/// A 256-bit vector of [8 x float] containing one of the operands.
320/// \returns A 256-bit vector of [8 x float] containing the products of both
321/// operands.
322static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
323 __m256 __b) {
324 return (__m256)((__v8sf)__a * (__v8sf)__b);
325}
326
327/// Calculates the square roots of the values in a 256-bit vector of
328/// [4 x double].
329///
330/// \headerfile <x86intrin.h>
331///
332/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
333///
334/// \param __a
335/// A 256-bit vector of [4 x double].
336/// \returns A 256-bit vector of [4 x double] containing the square roots of the
337/// values in the operand.
338static __inline __m256d __DEFAULT_FN_ATTRS
340{
341 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
342}
343
344/// Calculates the square roots of the values in a 256-bit vector of
345/// [8 x float].
346///
347/// \headerfile <x86intrin.h>
348///
349/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
350///
351/// \param __a
352/// A 256-bit vector of [8 x float].
353/// \returns A 256-bit vector of [8 x float] containing the square roots of the
354/// values in the operand.
355static __inline __m256 __DEFAULT_FN_ATTRS
357{
358 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
359}
360
361/// Calculates the reciprocal square roots of the values in a 256-bit
362/// vector of [8 x float].
363///
364/// \headerfile <x86intrin.h>
365///
366/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
367///
368/// \param __a
369/// A 256-bit vector of [8 x float].
370/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
371/// roots of the values in the operand.
372static __inline __m256 __DEFAULT_FN_ATTRS
374{
375 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
376}
377
378/// Calculates the reciprocals of the values in a 256-bit vector of
379/// [8 x float].
380///
381/// \headerfile <x86intrin.h>
382///
383/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
384///
385/// \param __a
386/// A 256-bit vector of [8 x float].
387/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
388/// values in the operand.
389static __inline __m256 __DEFAULT_FN_ATTRS
391{
392 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
393}
394
/// Rounds each element of a 256-bit vector of [4 x double] to an integer
///    value, using the rounding behavior selected by the byte operand. The
///    results are returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M)                                                  \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// Rounds each element of a 256-bit vector of [8 x float] to an integer
///    value, using the rounding behavior selected by the byte operand. The
///    results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M)                                                  \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// Rounds each element of a 256-bit vector of [4 x double] up to an integer
///    value; the results are returned as 64-bit double-precision
///    floating-point values. Expands to _mm256_round_pd with
///    _MM_FROUND_CEIL.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)

/// Rounds each element of a 256-bit vector of [4 x double] down to an
///    integer value; the results are returned as 64-bit double-precision
///    floating-point values. Expands to _mm256_round_pd with
///    _MM_FROUND_FLOOR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// Rounds each element of a 256-bit vector of [8 x float] up to an integer
///    value; the results are returned as floating-point values. Expands to
///    _mm256_round_ps with _MM_FROUND_CEIL.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)

/// Rounds each element of a 256-bit vector of [8 x float] down to an integer
///    value; the results are returned as floating-point values. Expands to
///    _mm256_round_ps with _MM_FROUND_FLOOR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

528/* Logical */
529/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
530///
531/// \headerfile <x86intrin.h>
532///
533/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
534///
535/// \param __a
536/// A 256-bit vector of [4 x double] containing one of the source operands.
537/// \param __b
538/// A 256-bit vector of [4 x double] containing one of the source operands.
539/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
540/// values between both operands.
541static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
542_mm256_and_pd(__m256d __a, __m256d __b)
543{
544 return (__m256d)((__v4du)__a & (__v4du)__b);
545}
546
547/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
548///
549/// \headerfile <x86intrin.h>
550///
551/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
552///
553/// \param __a
554/// A 256-bit vector of [8 x float] containing one of the source operands.
555/// \param __b
556/// A 256-bit vector of [8 x float] containing one of the source operands.
557/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
558/// values between both operands.
559static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
560_mm256_and_ps(__m256 __a, __m256 __b)
561{
562 return (__m256)((__v8su)__a & (__v8su)__b);
563}
564
565/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
566/// the one's complement of the values contained in the first source operand.
567///
568/// \headerfile <x86intrin.h>
569///
570/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
571///
572/// \param __a
573/// A 256-bit vector of [4 x double] containing the left source operand. The
574/// one's complement of this value is used in the bitwise AND.
575/// \param __b
576/// A 256-bit vector of [4 x double] containing the right source operand.
577/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
578/// values of the second operand and the one's complement of the first
579/// operand.
580static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
581_mm256_andnot_pd(__m256d __a, __m256d __b)
582{
583 return (__m256d)(~(__v4du)__a & (__v4du)__b);
584}
585
586/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
587/// the one's complement of the values contained in the first source operand.
588///
589/// \headerfile <x86intrin.h>
590///
591/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
592///
593/// \param __a
594/// A 256-bit vector of [8 x float] containing the left source operand. The
595/// one's complement of this value is used in the bitwise AND.
596/// \param __b
597/// A 256-bit vector of [8 x float] containing the right source operand.
598/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
599/// values of the second operand and the one's complement of the first
600/// operand.
601static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
602_mm256_andnot_ps(__m256 __a, __m256 __b)
603{
604 return (__m256)(~(__v8su)__a & (__v8su)__b);
605}
606
607/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
608///
609/// \headerfile <x86intrin.h>
610///
611/// This intrinsic corresponds to the <c> VORPD </c> instruction.
612///
613/// \param __a
614/// A 256-bit vector of [4 x double] containing one of the source operands.
615/// \param __b
616/// A 256-bit vector of [4 x double] containing one of the source operands.
617/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
618/// values between both operands.
619static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
620_mm256_or_pd(__m256d __a, __m256d __b)
621{
622 return (__m256d)((__v4du)__a | (__v4du)__b);
623}
624
625/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
626///
627/// \headerfile <x86intrin.h>
628///
629/// This intrinsic corresponds to the <c> VORPS </c> instruction.
630///
631/// \param __a
632/// A 256-bit vector of [8 x float] containing one of the source operands.
633/// \param __b
634/// A 256-bit vector of [8 x float] containing one of the source operands.
635/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
636/// values between both operands.
637static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
638_mm256_or_ps(__m256 __a, __m256 __b)
639{
640 return (__m256)((__v8su)__a | (__v8su)__b);
641}
642
643/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
644///
645/// \headerfile <x86intrin.h>
646///
647/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
648///
649/// \param __a
650/// A 256-bit vector of [4 x double] containing one of the source operands.
651/// \param __b
652/// A 256-bit vector of [4 x double] containing one of the source operands.
653/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
654/// values between both operands.
655static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
656_mm256_xor_pd(__m256d __a, __m256d __b)
657{
658 return (__m256d)((__v4du)__a ^ (__v4du)__b);
659}
660
661/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
662///
663/// \headerfile <x86intrin.h>
664///
665/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
666///
667/// \param __a
668/// A 256-bit vector of [8 x float] containing one of the source operands.
669/// \param __b
670/// A 256-bit vector of [8 x float] containing one of the source operands.
671/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
672/// values between both operands.
673static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
674_mm256_xor_ps(__m256 __a, __m256 __b)
675{
676 return (__m256)((__v8su)__a ^ (__v8su)__b);
677}
678
679/* Horizontal arithmetic */
680/// Horizontally adds the adjacent pairs of values contained in two
681/// 256-bit vectors of [4 x double].
682///
683/// \headerfile <x86intrin.h>
684///
685/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
686///
687/// \param __a
688/// A 256-bit vector of [4 x double] containing one of the source operands.
689/// The horizontal sums of the values are returned in the even-indexed
690/// elements of a vector of [4 x double].
691/// \param __b
692/// A 256-bit vector of [4 x double] containing one of the source operands.
693/// The horizontal sums of the values are returned in the odd-indexed
694/// elements of a vector of [4 x double].
695/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
696/// both operands.
697static __inline __m256d __DEFAULT_FN_ATTRS
698_mm256_hadd_pd(__m256d __a, __m256d __b)
699{
700 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
701}
702
703/// Horizontally adds the adjacent pairs of values contained in two
704/// 256-bit vectors of [8 x float].
705///
706/// \headerfile <x86intrin.h>
707///
708/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
709///
710/// \param __a
711/// A 256-bit vector of [8 x float] containing one of the source operands.
712/// The horizontal sums of the values are returned in the elements with
713/// index 0, 1, 4, 5 of a vector of [8 x float].
714/// \param __b
715/// A 256-bit vector of [8 x float] containing one of the source operands.
716/// The horizontal sums of the values are returned in the elements with
717/// index 2, 3, 6, 7 of a vector of [8 x float].
718/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
719/// both operands.
720static __inline __m256 __DEFAULT_FN_ATTRS
721_mm256_hadd_ps(__m256 __a, __m256 __b)
722{
723 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
724}
725
726/// Horizontally subtracts the adjacent pairs of values contained in two
727/// 256-bit vectors of [4 x double].
728///
729/// \headerfile <x86intrin.h>
730///
731/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
732///
733/// \param __a
734/// A 256-bit vector of [4 x double] containing one of the source operands.
735/// The horizontal differences between the values are returned in the
736/// even-indexed elements of a vector of [4 x double].
737/// \param __b
738/// A 256-bit vector of [4 x double] containing one of the source operands.
739/// The horizontal differences between the values are returned in the
740/// odd-indexed elements of a vector of [4 x double].
741/// \returns A 256-bit vector of [4 x double] containing the horizontal
742/// differences of both operands.
743static __inline __m256d __DEFAULT_FN_ATTRS
744_mm256_hsub_pd(__m256d __a, __m256d __b)
745{
746 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
747}
748
749/// Horizontally subtracts the adjacent pairs of values contained in two
750/// 256-bit vectors of [8 x float].
751///
752/// \headerfile <x86intrin.h>
753///
754/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
755///
756/// \param __a
757/// A 256-bit vector of [8 x float] containing one of the source operands.
758/// The horizontal differences between the values are returned in the
759/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
760/// \param __b
761/// A 256-bit vector of [8 x float] containing one of the source operands.
762/// The horizontal differences between the values are returned in the
763/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
764/// \returns A 256-bit vector of [8 x float] containing the horizontal
765/// differences of both operands.
766static __inline __m256 __DEFAULT_FN_ATTRS
767_mm256_hsub_ps(__m256 __a, __m256 __b)
768{
769 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
770}
771
772/* Vector permutations */
773/// Copies the values in a 128-bit vector of [2 x double] as specified
774/// by the 128-bit integer vector operand.
775///
776/// \headerfile <x86intrin.h>
777///
778/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
779///
780/// \param __a
781/// A 128-bit vector of [2 x double].
782/// \param __c
783/// A 128-bit integer vector operand specifying how the values are to be
784/// copied. \n
785/// Bit [1]: \n
786/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
787/// vector. \n
788/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
789/// returned vector. \n
790/// Bit [65]: \n
791/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
792/// returned vector. \n
793/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
794/// returned vector.
795/// \returns A 128-bit vector of [2 x double] containing the copied values.
796static __inline __m128d __DEFAULT_FN_ATTRS128
797_mm_permutevar_pd(__m128d __a, __m128i __c)
798{
799 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
800}
801
802/// Copies the values in a 256-bit vector of [4 x double] as specified
803/// by the 256-bit integer vector operand.
804///
805/// \headerfile <x86intrin.h>
806///
807/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
808///
809/// \param __a
810/// A 256-bit vector of [4 x double].
811/// \param __c
812/// A 256-bit integer vector operand specifying how the values are to be
813/// copied. \n
814/// Bit [1]: \n
815/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
816/// vector. \n
817/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
818/// returned vector. \n
819/// Bit [65]: \n
820/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
821/// returned vector. \n
822/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
823/// returned vector. \n
824/// Bit [129]: \n
825/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
826/// returned vector. \n
827/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
828/// returned vector. \n
829/// Bit [193]: \n
830/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
831/// returned vector. \n
832/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
833/// returned vector.
834/// \returns A 256-bit vector of [4 x double] containing the copied values.
835static __inline __m256d __DEFAULT_FN_ATTRS
836_mm256_permutevar_pd(__m256d __a, __m256i __c)
837{
838 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
839}
840
841/// Copies the values stored in a 128-bit vector of [4 x float] as
842/// specified by the 128-bit integer vector operand.
843///
844/// \headerfile <x86intrin.h>
845///
846/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
847///
848/// \param __a
849/// A 128-bit vector of [4 x float].
850/// \param __c
851/// A 128-bit integer vector operand specifying how the values are to be
852/// copied. \n
853/// Bits [1:0]: \n
854/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
855/// returned vector. \n
856/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
857/// returned vector. \n
858/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
859/// returned vector. \n
860/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
861/// returned vector. \n
862/// Bits [33:32]: \n
863/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
864/// returned vector. \n
865/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
866/// returned vector. \n
867/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
868/// returned vector. \n
869/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
870/// returned vector. \n
871/// Bits [65:64]: \n
872/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
873/// returned vector. \n
874/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
875/// returned vector. \n
876/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
877/// returned vector. \n
878/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
879/// returned vector. \n
880/// Bits [97:96]: \n
881/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
882/// returned vector. \n
883/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
884/// returned vector. \n
885/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
886/// returned vector. \n
887/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
888/// returned vector.
889/// \returns A 128-bit vector of [4 x float] containing the copied values.
890static __inline __m128 __DEFAULT_FN_ATTRS128
891_mm_permutevar_ps(__m128 __a, __m128i __c)
892{
893 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); /* __a is passed as __v4sf, the control vector __c as __v4si; result cast back to __m128 */
894}
895
896/// Copies the values stored in a 256-bit vector of [8 x float] as
897/// specified by the 256-bit integer vector operand.
898///
899/// \headerfile <x86intrin.h>
900///
901/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
902///
903/// \param __a
904/// A 256-bit vector of [8 x float].
905/// \param __c
906/// A 256-bit integer vector operand specifying how the values are to be
907/// copied. \n
908/// Bits [1:0]: \n
909/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
910/// returned vector. \n
911/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
912/// returned vector. \n
913/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
914/// returned vector. \n
915/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
916/// returned vector. \n
917/// Bits [33:32]: \n
918/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
919/// returned vector. \n
920/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
921/// returned vector. \n
922/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
923/// returned vector. \n
924/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
925/// returned vector. \n
926/// Bits [65:64]: \n
927/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
928/// returned vector. \n
929/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
930/// returned vector. \n
931/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
932/// returned vector. \n
933/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
934/// returned vector. \n
935/// Bits [97:96]: \n
936/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
937/// returned vector. \n
938/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
939/// returned vector. \n
940/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
941/// returned vector. \n
942/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
943/// returned vector. \n
944/// Bits [129:128]: \n
945/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
946/// returned vector. \n
947/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
948/// returned vector. \n
949/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
950/// returned vector. \n
951/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
952/// returned vector. \n
953/// Bits [161:160]: \n
954/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
955/// returned vector. \n
956/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
957/// returned vector. \n
958/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
959/// returned vector. \n
960/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
961/// returned vector. \n
962/// Bits [193:192]: \n
963/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
964/// returned vector. \n
965/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
966/// returned vector. \n
967/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
968/// returned vector. \n
969/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
970/// returned vector. \n
971/// Bits [225:224]: \n
972/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
973/// returned vector. \n
974/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
975/// returned vector. \n
976/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
977/// returned vector. \n
978/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
979/// returned vector.
980/// \returns A 256-bit vector of [8 x float] containing the copied values.
981static __inline __m256 __DEFAULT_FN_ATTRS
983{
984 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
985}
986
987/// Copies the values in a 128-bit vector of [2 x double] as specified
988/// by the immediate integer operand.
989///
990/// \headerfile <x86intrin.h>
991///
992/// \code
993/// __m128d _mm_permute_pd(__m128d A, const int C);
994/// \endcode
995///
996/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
997///
998/// \param A
999/// A 128-bit vector of [2 x double].
1000/// \param C
1001/// An immediate integer operand specifying how the values are to be
1002/// copied. \n
1003/// Bit [0]: \n
1004/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1005/// vector. \n
1006/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1007/// returned vector. \n
1008/// Bit [1]: \n
1009/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1010/// returned vector. \n
1011/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1012/// returned vector.
1013/// \returns A 128-bit vector of [2 x double] containing the copied values.
1014#define _mm_permute_pd(A, C) \
1015 ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))) /* A: cast to __m128d, then to __v2df; C: the immediate selector, forwarded as int */
1016
1017/// Copies the values in a 256-bit vector of [4 x double] as specified by
1018/// the immediate integer operand.
1019///
1020/// \headerfile <x86intrin.h>
1021///
1022/// \code
1023/// __m256d _mm256_permute_pd(__m256d A, const int C);
1024/// \endcode
1025///
1026/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1027///
1028/// \param A
1029/// A 256-bit vector of [4 x double].
1030/// \param C
1031/// An immediate integer operand specifying how the values are to be
1032/// copied. \n
1033/// Bit [0]: \n
1034/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1035/// vector. \n
1036/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1037/// returned vector. \n
1038/// Bit [1]: \n
1039/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1040/// returned vector. \n
1041/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1042/// returned vector. \n
1043/// Bit [2]: \n
1044/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1045/// returned vector. \n
1046/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1047/// returned vector. \n
1048/// Bit [3]: \n
1049/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1050/// returned vector. \n
1051/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1052/// returned vector.
1053/// \returns A 256-bit vector of [4 x double] containing the copied values.
1054#define _mm256_permute_pd(A, C) \
1055 ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))) /* A: cast to __m256d, then to __v4df; C: the immediate selector, forwarded as int */
1056
1057/// Copies the values in a 128-bit vector of [4 x float] as specified by
1058/// the immediate integer operand.
1059///
1060/// \headerfile <x86intrin.h>
1061///
1062/// \code
1063/// __m128 _mm_permute_ps(__m128 A, const int C);
1064/// \endcode
1065///
1066/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1067///
1068/// \param A
1069/// A 128-bit vector of [4 x float].
1070/// \param C
1071/// An immediate integer operand specifying how the values are to be
1072/// copied. \n
1073/// Bits [1:0]: \n
1074/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1075/// returned vector. \n
1076/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1077/// returned vector. \n
1078/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1079/// returned vector. \n
1080/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1081/// returned vector. \n
1082/// Bits [3:2]: \n
1083/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1084/// returned vector. \n
1085/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1086/// returned vector. \n
1087/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1088/// returned vector. \n
1089/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1090/// returned vector. \n
1091/// Bits [5:4]: \n
1092/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1093/// returned vector. \n
1094/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1095/// returned vector. \n
1096/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1097/// returned vector. \n
1098/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1099/// returned vector. \n
1100/// Bits [7:6]: \n
1101/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1102/// returned vector. \n
1103/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1104/// returned vector. \n
1105/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1106/// returned vector. \n
1107/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1108/// returned vector.
1109/// \returns A 128-bit vector of [4 x float] containing the copied values.
1110#define _mm_permute_ps(A, C) \
1111 ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))) /* A: cast to __m128, then to __v4sf; C: the immediate selector, forwarded as int */
1112
1113/// Copies the values in a 256-bit vector of [8 x float] as specified by
1114/// the immediate integer operand.
1115///
1116/// \headerfile <x86intrin.h>
1117///
1118/// \code
1119/// __m256 _mm256_permute_ps(__m256 A, const int C);
1120/// \endcode
1121///
1122/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1123///
1124/// \param A
1125/// A 256-bit vector of [8 x float].
1126/// \param C
1127/// An immediate integer operand specifying how the values are to be
1128/// copied. \n
1129/// Bits [1:0]: \n
1130/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1131/// returned vector. \n
1132/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1133/// returned vector. \n
1134/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1135/// returned vector. \n
1136/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1137/// returned vector. \n
1138/// Bits [3:2]: \n
1139/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1140/// returned vector. \n
1141/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1142/// returned vector. \n
1143/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1144/// returned vector. \n
1145/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1146/// returned vector. \n
1147/// Bits [5:4]: \n
1148/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1149/// returned vector. \n
1150/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1151/// returned vector. \n
1152/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1153/// returned vector. \n
1154/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1155/// returned vector. \n
1156/// Bits [7:6]: \n
1157/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1158/// returned vector. \n
1159/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1160/// returned vector. \n
1161/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1162/// returned vector. \n
1163/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1164/// returned vector. \n
1165/// Bits [1:0] (applied again to the upper 128-bit half): \n
1166/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1167/// returned vector. \n
1168/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1169/// returned vector. \n
1170/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1171/// returned vector. \n
1172/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1173/// returned vector. \n
1174/// Bits [3:2] (applied again to the upper 128-bit half): \n
1175/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1176/// returned vector. \n
1177/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1178/// returned vector. \n
1179/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1180/// returned vector. \n
1181/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1182/// returned vector. \n
1183/// Bits [5:4] (applied again to the upper 128-bit half): \n
1184/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1185/// returned vector. \n
1186/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1187/// returned vector. \n
1188/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1189/// returned vector. \n
1190/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1191/// returned vector. \n
1192/// Bits [7:6] (applied again to the upper 128-bit half): \n
1193/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1194/// returned vector. \n
1195/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1196/// returned vector. \n
1197/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1198/// returned vector. \n
1199/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1200/// returned vector.
1201/// \returns A 256-bit vector of [8 x float] containing the copied values.
1202#define _mm256_permute_ps(A, C) \
1203 ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))) /* A: cast to __m256, then to __v8sf; C: the immediate selector, forwarded as int */
1204
1205/// Permutes 128-bit data values stored in two 256-bit vectors of
1206/// [4 x double], as specified by the immediate integer operand.
1207///
1208/// \headerfile <x86intrin.h>
1209///
1210/// \code
1211/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1212/// \endcode
1213///
1214/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1215///
1216/// \param V1
1217/// A 256-bit vector of [4 x double].
1218/// \param V2
1219/// A 256-bit vector of [4 x double].
1220/// \param M
1221/// An immediate integer operand specifying how the values are to be
1222/// permuted. \n
1223/// Bits [1:0]: \n
1224/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1225/// destination. \n
1226/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1227/// destination. \n
1228/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1229/// destination. \n
1230/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1231/// destination. \n
1232/// Bits [5:4]: \n
1233/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1234/// destination. \n
1235/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1236/// destination. \n
1237/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1238/// destination. \n
1239/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1240/// destination.
1241/// \returns A 256-bit vector of [4 x double] containing the copied values.
1242#define _mm256_permute2f128_pd(V1, V2, M) \
1243 ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1244 (__v4df)(__m256d)(V2), (int)(M))) /* V1/V2: cast to __m256d, then to __v4df; M: the immediate selector, forwarded as int */
1245
1246/// Permutes 128-bit data values stored in two 256-bit vectors of
1247/// [8 x float], as specified by the immediate integer operand.
1248///
1249/// \headerfile <x86intrin.h>
1250///
1251/// \code
1252/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1253/// \endcode
1254///
1255/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1256///
1257/// \param V1
1258/// A 256-bit vector of [8 x float].
1259/// \param V2
1260/// A 256-bit vector of [8 x float].
1261/// \param M
1262/// An immediate integer operand specifying how the values are to be
1263/// permuted. \n
1264/// Bits [1:0]: \n
1265/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1266/// destination. \n
1267/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1268/// destination. \n
1269/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1270/// destination. \n
1271/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1272/// destination. \n
1273/// Bits [5:4]: \n
1274/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1275/// destination. \n
1276/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1277/// destination. \n
1278/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1279/// destination. \n
1280/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1281/// destination.
1282/// \returns A 256-bit vector of [8 x float] containing the copied values.
1283#define _mm256_permute2f128_ps(V1, V2, M) \
1284 ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1285 (__v8sf)(__m256)(V2), (int)(M))) /* V1/V2: cast to __m256, then to __v8sf; M: the immediate selector, forwarded as int */
1286
1287/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1288/// as specified by the immediate integer operand.
1289///
1290/// \headerfile <x86intrin.h>
1291///
1292/// \code
1293/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1294/// \endcode
1295///
1296/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1297///
1298/// \param V1
1299/// A 256-bit integer vector.
1300/// \param V2
1301/// A 256-bit integer vector.
1302/// \param M
1303/// An immediate integer operand specifying how the values are to be copied. \n
1304/// Bits [1:0]: \n
1305/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1306/// destination. \n
1307/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1308/// destination. \n
1309/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1310/// destination. \n
1311/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1312/// destination. \n
1313/// Bits [5:4]: \n
1314/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1315/// destination. \n
1316/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1317/// destination. \n
1318/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1319/// destination. \n
1320/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1321/// destination.
1322/// \returns A 256-bit integer vector containing the copied values.
1323#define _mm256_permute2f128_si256(V1, V2, M) \
1324 ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1325 (__v8si)(__m256i)(V2), (int)(M))) /* V1/V2: cast to __m256i, then to __v8si; M: the immediate selector, forwarded as int */
1326
1327/* Vector Blend */
1328/// Merges 64-bit double-precision data values stored in either of the
1329/// two 256-bit vectors of [4 x double], as specified by the immediate
1330/// integer operand.
1331///
1332/// \headerfile <x86intrin.h>
1333///
1334/// \code
1335/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1336/// \endcode
1337///
1338/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1339///
1340/// \param V1
1341/// A 256-bit vector of [4 x double].
1342/// \param V2
1343/// A 256-bit vector of [4 x double].
1344/// \param M
1345/// An immediate integer operand, with mask bits [3:0] specifying how the
1346/// values are to be copied. The position of the mask bit corresponds to the
1347/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1348/// element in operand \a V1 is copied to the same position in the
1349/// destination. When a mask bit is 1, the corresponding 64-bit element in
1350/// operand \a V2 is copied to the same position in the destination.
1351/// \returns A 256-bit vector of [4 x double] containing the copied values.
1352#define _mm256_blend_pd(V1, V2, M) \
1353 ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
1354 (__v4df)(__m256d)(V2), (int)(M))) /* V1/V2: cast to __m256d, then to __v4df; M: the immediate blend mask, forwarded as int */
1355
1356/// Merges 32-bit single-precision data values stored in either of the
1357/// two 256-bit vectors of [8 x float], as specified by the immediate
1358/// integer operand.
1359///
1360/// \headerfile <x86intrin.h>
1361///
1362/// \code
1363/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1364/// \endcode
1365///
1366/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1367///
1368/// \param V1
1369/// A 256-bit vector of [8 x float].
1370/// \param V2
1371/// A 256-bit vector of [8 x float].
1372/// \param M
1373/// An immediate integer operand, with mask bits [7:0] specifying how the
1374/// values are to be copied. The position of the mask bit corresponds to the
1375/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1376/// element in operand \a V1 is copied to the same position in the
1377/// destination. When a mask bit is 1, the corresponding 32-bit element in
1378/// operand \a V2 is copied to the same position in the destination.
1379/// \returns A 256-bit vector of [8 x float] containing the copied values.
1380#define _mm256_blend_ps(V1, V2, M) \
1381 ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
1382 (__v8sf)(__m256)(V2), (int)(M))) /* V1/V2: cast to __m256, then to __v8sf; M: the immediate blend mask, forwarded as int */
1383
1384/// Merges 64-bit double-precision data values stored in either of the
1385/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1386/// operand.
1387///
1388/// \headerfile <x86intrin.h>
1389///
1390/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1391///
1392/// \param __a
1393/// A 256-bit vector of [4 x double].
1394/// \param __b
1395/// A 256-bit vector of [4 x double].
1396/// \param __c
1397/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1398/// how the values are to be copied. The position of the mask bit corresponds
1399/// to the most significant bit of a copied value. When a mask bit is 0, the
1400/// corresponding 64-bit element in operand \a __a is copied to the same
1401/// position in the destination. When a mask bit is 1, the corresponding
1402/// 64-bit element in operand \a __b is copied to the same position in the
1403/// destination.
1404/// \returns A 256-bit vector of [4 x double] containing the copied values.
1405static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
1406_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) { /* __c is the mask vector: per the doc above, a set mask bit selects the element of __b, a clear one the element of __a */
1407 return (__m256d)__builtin_ia32_blendvpd256(
1408 (__v4df)__a, (__v4df)__b, (__v4df)__c); /* all three operands are passed as __v4df */
1409}
1410
1411/// Merges 32-bit single-precision data values stored in either of the
1412/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1413/// operand.
1414///
1415/// \headerfile <x86intrin.h>
1416///
1417/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1418///
1419/// \param __a
1420/// A 256-bit vector of [8 x float].
1421/// \param __b
1422/// A 256-bit vector of [8 x float].
1423/// \param __c
1424/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1425/// and 31 specifying how the values are to be copied. The position of the
1426/// mask bit corresponds to the most significant bit of a copied value. When
1427/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1428/// copied to the same position in the destination. When a mask bit is 1, the
1429/// corresponding 32-bit element in operand \a __b is copied to the same
1430/// position in the destination.
1431/// \returns A 256-bit vector of [8 x float] containing the copied values.
1432static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
1433_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) { /* __c is the mask vector: per the doc above, a set mask bit selects the element of __b, a clear one the element of __a */
1434 return (__m256)__builtin_ia32_blendvps256(
1435 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); /* all three operands are passed as __v8sf */
1436}
1437
1438/* Vector Dot Product */
1439/// Computes two dot products in parallel, using the lower and upper
1440/// halves of two [8 x float] vectors as input to the two computations, and
1441/// returning the two dot products in the lower and upper halves of the
1442/// [8 x float] result.
1443///
1444/// The immediate integer operand controls which input elements will
1445/// contribute to the dot product, and where the final results are returned.
1446/// In general, for each dot product, the four corresponding elements of the
1447/// input vectors are multiplied; the first two and second two products are
1448/// summed, then the two sums are added to form the final result.
1449///
1450/// \headerfile <x86intrin.h>
1451///
1452/// \code
1453/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1454/// \endcode
1455///
1456/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1457///
1458/// \param V1
1459/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1460/// \param V2
1461/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1462/// \param M
1463/// An immediate integer argument. Bits [7:4] determine which elements of
1464/// the input vectors are used, with bit [4] corresponding to the lowest
1465/// element and bit [7] corresponding to the highest element of each [4 x
1466/// float] subvector. If a bit is set, the corresponding elements from the
1467/// two input vectors are used as an input for dot product; otherwise that
1468/// input is treated as zero. Bits [3:0] determine which elements of the
1469/// result will receive a copy of the final dot product, with bit [0]
1470/// corresponding to the lowest element and bit [3] corresponding to the
1471/// highest element of each [4 x float] subvector. If a bit is set, the dot
1472/// product is returned in the corresponding element; otherwise that element
1473/// is set to zero. The bitmask is applied in the same way to each of the
1474/// two parallel dot product computations.
1475/// \returns A 256-bit vector of [8 x float] containing the two dot products.
1476#define _mm256_dp_ps(V1, V2, M) \
1477 ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
1478 (__v8sf)(__m256)(V2), (M))) /* V1/V2: cast to __m256, then to __v8sf. NOTE(review): M is forwarded without the (int) cast that the neighbouring immediate macros apply -- confirm this is intentional */
1479
1480/* Vector shuffle */
1481/// Selects 8 float values from the 256-bit operands of [8 x float], as
1482/// specified by the immediate value operand.
1483///
1484/// The four selected elements in each operand are copied to the destination
1485/// according to the bits specified in the immediate operand. The selected
1486/// elements from the first 256-bit operand are copied to bits [63:0] and
1487/// bits [191:128] of the destination, and the selected elements from the
1488/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1489/// the destination. For example, if bits [7:0] of the immediate operand
1490/// contain a value of 0xFF, the 256-bit destination vector would contain the
1491/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1492///
1493/// \headerfile <x86intrin.h>
1494///
1495/// \code
1496/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1497/// \endcode
1498///
1499/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1500///
1501/// \param a
1502/// A 256-bit vector of [8 x float]. The four selected elements in this
1503/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1504/// according to the bits specified in the immediate operand.
1505/// \param b
1506/// A 256-bit vector of [8 x float]. The four selected elements in this
1507/// operand are copied to bits [127:64] and bits [255:192] in the
1508/// destination, according to the bits specified in the immediate operand.
1509/// \param mask
1510/// An immediate value containing an 8-bit value specifying which elements to
1511/// copy from \a a and \a b. \n
1512/// Bits [3:0] specify the values copied from operand \a a. \n
1513/// Bits [7:4] specify the values copied from operand \a b. \n
1514/// The destinations within the 256-bit destination are assigned values as
1515/// follows, according to the bit value assignments described below: \n
1516/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1517/// destination. \n
1518/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1519/// destination. \n
1520/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1521/// destination. \n
1522/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1523/// the destination. \n
1524/// Bit value assignments: \n
1525/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1526/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1527/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1528/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1529/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1530/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1531/// <c>[b6, b4, b2, b0]</c>.
1532/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
1533#define _mm256_shuffle_ps(a, b, mask) \
1534 ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
1535 (__v8sf)(__m256)(b), (int)(mask))) /* a/b: cast to __m256, then to __v8sf; mask: the immediate selector, forwarded as int */
1536
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values, listed from the highest element to the
///    lowest: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate integer operand whose bits [3:0] specify which elements to
///    copy from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1582
/* Compare */
/* Named comparison predicates 0x08 through 0x1f, for use as the immediate
 * operand of the cmp intrinsics documented below. Predicates 0x00 through
 * 0x07 are not defined in this part of the file. */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
1608
1609/* Below intrinsic defined in emmintrin.h can be used for AVX */
1610/// Compares each of the corresponding double-precision values of two
1611/// 128-bit vectors of [2 x double], using the operation specified by the
1612/// immediate integer operand.
1613///
1614/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1615/// If either value in a comparison is NaN, comparisons that are ordered
1616/// return false, and comparisons that are unordered return true.
1617///
1618/// \headerfile <x86intrin.h>
1619///
1620/// \code
1621/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1622/// \endcode
1623///
1624/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1625///
1626/// \param a
1627/// A 128-bit vector of [2 x double].
1628/// \param b
1629/// A 128-bit vector of [2 x double].
1630/// \param c
1631/// An immediate integer operand, with bits [4:0] specifying which comparison
1632/// operation to use: \n
1633/// 0x00: Equal (ordered, non-signaling) \n
1634/// 0x01: Less-than (ordered, signaling) \n
1635/// 0x02: Less-than-or-equal (ordered, signaling) \n
1636/// 0x03: Unordered (non-signaling) \n
1637/// 0x04: Not-equal (unordered, non-signaling) \n
1638/// 0x05: Not-less-than (unordered, signaling) \n
1639/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1640/// 0x07: Ordered (non-signaling) \n
1641/// 0x08: Equal (unordered, non-signaling) \n
1642/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1643/// 0x0A: Not-greater-than (unordered, signaling) \n
1644/// 0x0B: False (ordered, non-signaling) \n
1645/// 0x0C: Not-equal (ordered, non-signaling) \n
1646/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1647/// 0x0E: Greater-than (ordered, signaling) \n
1648/// 0x0F: True (unordered, non-signaling) \n
1649/// 0x10: Equal (ordered, signaling) \n
1650/// 0x11: Less-than (ordered, non-signaling) \n
1651/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1652/// 0x13: Unordered (signaling) \n
1653/// 0x14: Not-equal (unordered, signaling) \n
1654/// 0x15: Not-less-than (unordered, non-signaling) \n
1655/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1656/// 0x17: Ordered (signaling) \n
1657/// 0x18: Equal (unordered, signaling) \n
1658/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1659/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1660/// 0x1B: False (ordered, signaling) \n
1661/// 0x1C: Not-equal (ordered, signaling) \n
1662/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1663/// 0x1E: Greater-than (ordered, non-signaling) \n
1664/// 0x1F: True (unordered, signaling)
1665/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1666/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1667
1668/* Below intrinsic defined in xmmintrin.h can be used for AVX */
1669/// Compares each of the corresponding values of two 128-bit vectors of
1670/// [4 x float], using the operation specified by the immediate integer
1671/// operand.
1672///
1673/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1674/// If either value in a comparison is NaN, comparisons that are ordered
1675/// return false, and comparisons that are unordered return true.
1676///
1677/// \headerfile <x86intrin.h>
1678///
1679/// \code
1680/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1681/// \endcode
1682///
1683/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1684///
1685/// \param a
1686/// A 128-bit vector of [4 x float].
1687/// \param b
1688/// A 128-bit vector of [4 x float].
1689/// \param c
1690/// An immediate integer operand, with bits [4:0] specifying which comparison
1691/// operation to use: \n
1692/// 0x00: Equal (ordered, non-signaling) \n
1693/// 0x01: Less-than (ordered, signaling) \n
1694/// 0x02: Less-than-or-equal (ordered, signaling) \n
1695/// 0x03: Unordered (non-signaling) \n
1696/// 0x04: Not-equal (unordered, non-signaling) \n
1697/// 0x05: Not-less-than (unordered, signaling) \n
1698/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1699/// 0x07: Ordered (non-signaling) \n
1700/// 0x08: Equal (unordered, non-signaling) \n
1701/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1702/// 0x0A: Not-greater-than (unordered, signaling) \n
1703/// 0x0B: False (ordered, non-signaling) \n
1704/// 0x0C: Not-equal (ordered, non-signaling) \n
1705/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1706/// 0x0E: Greater-than (ordered, signaling) \n
1707/// 0x0F: True (unordered, non-signaling) \n
1708/// 0x10: Equal (ordered, signaling) \n
1709/// 0x11: Less-than (ordered, non-signaling) \n
1710/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1711/// 0x13: Unordered (signaling) \n
1712/// 0x14: Not-equal (unordered, signaling) \n
1713/// 0x15: Not-less-than (unordered, non-signaling) \n
1714/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1715/// 0x17: Ordered (signaling) \n
1716/// 0x18: Equal (unordered, signaling) \n
1717/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1718/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1719/// 0x1B: False (ordered, signaling) \n
1720/// 0x1C: Not-equal (ordered, signaling) \n
1721/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1722/// 0x1E: Greater-than (ordered, non-signaling) \n
1723/// 0x1F: True (unordered, signaling)
1724/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1725/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1726
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
///    Predicates 0x08 through 0x1F have named constants in this header, from
///    \c _CMP_EQ_UQ (0x08) to \c _CMP_TRUE_US (0x1F).
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1786
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
///    Predicates 0x08 through 0x1F have named constants in this header, from
///    \c _CMP_EQ_UQ (0x08) to \c _CMP_TRUE_US (0x1F).
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1846
1847/* Below intrinsic defined in emmintrin.h can be used for AVX */
1848/// Compares the scalar double-precision values in the low 64 bits of
1849/// two 128-bit vectors of [2 x double], using the operation specified by the
1850/// immediate integer operand.
1851///
1852/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1853/// If either value in the comparison is NaN, comparisons that are ordered
1854/// return false, and comparisons that are unordered return true.
1855///
1856/// \headerfile <x86intrin.h>
1857///
1858/// \code
1859/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1860/// \endcode
1861///
1862/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1863///
1864/// \param a
1865/// A 128-bit vector of [2 x double].
1866/// \param b
1867/// A 128-bit vector of [2 x double].
1868/// \param c
1869/// An immediate integer operand, with bits [4:0] specifying which comparison
1870/// operation to use: \n
1871/// 0x00: Equal (ordered, non-signaling) \n
1872/// 0x01: Less-than (ordered, signaling) \n
1873/// 0x02: Less-than-or-equal (ordered, signaling) \n
1874/// 0x03: Unordered (non-signaling) \n
1875/// 0x04: Not-equal (unordered, non-signaling) \n
1876/// 0x05: Not-less-than (unordered, signaling) \n
1877/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1878/// 0x07: Ordered (non-signaling) \n
1879/// 0x08: Equal (unordered, non-signaling) \n
1880/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1881/// 0x0A: Not-greater-than (unordered, signaling) \n
1882/// 0x0B: False (ordered, non-signaling) \n
1883/// 0x0C: Not-equal (ordered, non-signaling) \n
1884/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1885/// 0x0E: Greater-than (ordered, signaling) \n
1886/// 0x0F: True (unordered, non-signaling) \n
1887/// 0x10: Equal (ordered, signaling) \n
1888/// 0x11: Less-than (ordered, non-signaling) \n
1889/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1890/// 0x13: Unordered (signaling) \n
1891/// 0x14: Not-equal (unordered, signaling) \n
1892/// 0x15: Not-less-than (unordered, non-signaling) \n
1893/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1894/// 0x17: Ordered (signaling) \n
1895/// 0x18: Equal (unordered, signaling) \n
1896/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1897/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1898/// 0x1B: False (ordered, signaling) \n
1899/// 0x1C: Not-equal (ordered, signaling) \n
1900/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1901/// 0x1E: Greater-than (ordered, non-signaling) \n
1902/// 0x1F: True (unordered, signaling)
1903/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1904/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1905
1906/* Below intrinsic defined in xmmintrin.h can be used for AVX */
1907/// Compares the scalar single-precision values in the low 32 bits of two
1908/// 128-bit vectors of [4 x float], using the operation specified by the
1909/// immediate integer operand.
1910///
1911/// The comparison returns 0x0 for false, 0xFFFFFFFF for true.
1912/// If either value in the comparison is NaN, comparisons that are ordered
1913/// return false, and comparisons that are unordered return true.
1914///
1915/// \headerfile <x86intrin.h>
1916///
1917/// \code
1918/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1919/// \endcode
1920///
1921/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1922///
1923/// \param a
1924/// A 128-bit vector of [4 x float].
1925/// \param b
1926/// A 128-bit vector of [4 x float].
1927/// \param c
1928/// An immediate integer operand, with bits [4:0] specifying which comparison
1929/// operation to use: \n
1930/// 0x00: Equal (ordered, non-signaling) \n
1931/// 0x01: Less-than (ordered, signaling) \n
1932/// 0x02: Less-than-or-equal (ordered, signaling) \n
1933/// 0x03: Unordered (non-signaling) \n
1934/// 0x04: Not-equal (unordered, non-signaling) \n
1935/// 0x05: Not-less-than (unordered, signaling) \n
1936/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1937/// 0x07: Ordered (non-signaling) \n
1938/// 0x08: Equal (unordered, non-signaling) \n
1939/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1940/// 0x0A: Not-greater-than (unordered, signaling) \n
1941/// 0x0B: False (ordered, non-signaling) \n
1942/// 0x0C: Not-equal (ordered, non-signaling) \n
1943/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1944/// 0x0E: Greater-than (ordered, signaling) \n
1945/// 0x0F: True (unordered, non-signaling) \n
1946/// 0x10: Equal (ordered, signaling) \n
1947/// 0x11: Less-than (ordered, non-signaling) \n
1948/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1949/// 0x13: Unordered (signaling) \n
1950/// 0x14: Not-equal (unordered, signaling) \n
1951/// 0x15: Not-less-than (unordered, non-signaling) \n
1952/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1953/// 0x17: Ordered (signaling) \n
1954/// 0x18: Equal (unordered, signaling) \n
1955/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1956/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1957/// 0x1B: False (ordered, signaling) \n
1958/// 0x1C: Not-equal (ordered, signaling) \n
1959/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1960/// 0x1E: Greater-than (ordered, non-signaling) \n
1961/// 0x1F: True (unordered, signaling)
1962/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1963/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1964
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the selected 32-bit element of \a X.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1986
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    The extracted element is converted to <c>unsigned short</c> before being
///    widened to <c>int</c>, so the result is zero-extended rather than
///    sign-extended.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
2009
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    The extracted element is converted to <c>unsigned char</c> before being
///    widened to <c>int</c>, so the result is zero-extended rather than
///    sign-extended.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2032
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    This intrinsic is only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the selected 64-bit element of \a X.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2056
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2082
2083
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value, passed as an \c int. The replacement value for the
///    insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2109
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value, passed as an \c int. The replacement value for the
///    insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2135
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
///    This intrinsic is only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2163
2164/* Conversion */
2165/// Converts a vector of [4 x i32] into a vector of [4 x double].
2166///
2167/// \headerfile <x86intrin.h>
2168///
2169/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2170///
2171/// \param __a
2172/// A 128-bit integer vector of [4 x i32].
2173/// \returns A 256-bit vector of [4 x double] containing the converted values.
2174static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2176 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2177}
2178
2179/// Converts a vector of [8 x i32] into a vector of [8 x float].
2180///
2181/// \headerfile <x86intrin.h>
2182///
2183/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2184///
2185/// \param __a
2186/// A 256-bit integer vector.
2187/// \returns A 256-bit vector of [8 x float] containing the converted values.
2188static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2190 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2191}
2192
2193/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2194/// [4 x float].
2195///
2196/// \headerfile <x86intrin.h>
2197///
2198/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2199///
2200/// \param __a
2201/// A 256-bit vector of [4 x double].
2202/// \returns A 128-bit vector of [4 x float] containing the converted values.
2203static __inline __m128 __DEFAULT_FN_ATTRS
2205{
2206 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2207}
2208
2209/// Converts a vector of [8 x float] into a vector of [8 x i32].
2210///
2211/// If a converted value does not fit in a 32-bit integer, raises a
2212/// floating-point invalid exception. If the exception is masked, returns
2213/// the most negative integer.
2214///
2215/// \headerfile <x86intrin.h>
2216///
2217/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2218///
2219/// \param __a
2220/// A 256-bit vector of [8 x float].
2221/// \returns A 256-bit integer vector containing the converted values.
2222static __inline __m256i __DEFAULT_FN_ATTRS
2224{
2225 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2226}
2227
2228/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2229/// x double].
2230///
2231/// \headerfile <x86intrin.h>
2232///
2233/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2234///
2235/// \param __a
2236/// A 128-bit vector of [4 x float].
2237/// \returns A 256-bit vector of [4 x double] containing the converted values.
2238static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2240 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2241}
2242
2243/// Converts a 256-bit vector of [4 x double] into four signed truncated
2244/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2245/// [4 x i32].
2246///
2247/// If a converted value does not fit in a 32-bit integer, raises a
2248/// floating-point invalid exception. If the exception is masked, returns
2249/// the most negative integer.
2250///
2251/// \headerfile <x86intrin.h>
2252///
2253/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2254///
2255/// \param __a
2256/// A 256-bit vector of [4 x double].
2257/// \returns A 128-bit integer vector containing the converted values.
2258static __inline __m128i __DEFAULT_FN_ATTRS
2260{
2261 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2262}
2263
2264/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2265/// [4 x i32].
2266///
2267/// If a converted value does not fit in a 32-bit integer, raises a
2268/// floating-point invalid exception. If the exception is masked, returns
2269/// the most negative integer.
2270///
2271/// \headerfile <x86intrin.h>
2272///
2273/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2274///
2275/// \param __a
2276/// A 256-bit vector of [4 x double].
2277/// \returns A 128-bit integer vector containing the converted values.
2278static __inline __m128i __DEFAULT_FN_ATTRS
2280{
2281 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2282}
2283
2284/// Converts a vector of [8 x float] into eight signed truncated (rounded
2285/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2286///
2287/// If a converted value does not fit in a 32-bit integer, raises a
2288/// floating-point invalid exception. If the exception is masked, returns
2289/// the most negative integer.
2290///
2291/// \headerfile <x86intrin.h>
2292///
2293/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2294///
2295/// \param __a
2296/// A 256-bit vector of [8 x float].
2297/// \returns A 256-bit integer vector containing the converted values.
2298static __inline __m256i __DEFAULT_FN_ATTRS
2300{
2301 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2302}
2303
2304/// Returns the first element of the input vector of [4 x double].
2305///
2306/// \headerfile <x86intrin.h>
2307///
2308/// This intrinsic is a utility function and does not correspond to a specific
2309/// instruction.
2310///
2311/// \param __a
2312/// A 256-bit vector of [4 x double].
2313/// \returns A 64 bit double containing the first element of the input vector.
2314static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR
2316 return __a[0];
2317}
2318
2319/// Returns the first element of the input vector of [8 x i32].
2320///
2321/// \headerfile <x86intrin.h>
2322///
2323/// This intrinsic is a utility function and does not correspond to a specific
2324/// instruction.
2325///
2326/// \param __a
2327/// A 256-bit vector of [8 x i32].
2328/// \returns A 32 bit integer containing the first element of the input vector.
2329static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2331 __v8si __b = (__v8si)__a;
2332 return __b[0];
2333}
2334
2335/// Returns the first element of the input vector of [8 x float].
2336///
2337/// \headerfile <x86intrin.h>
2338///
2339/// This intrinsic is a utility function and does not correspond to a specific
2340/// instruction.
2341///
2342/// \param __a
2343/// A 256-bit vector of [8 x float].
2344/// \returns A 32 bit float containing the first element of the input vector.
2345static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR
2347 return __a[0];
2348}
2349
2350/* Vector replicate */
2351/// Moves and duplicates odd-indexed values from a 256-bit vector of
2352/// [8 x float] to float values in a 256-bit vector of [8 x float].
2353///
2354/// \headerfile <x86intrin.h>
2355///
2356/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2357///
2358/// \param __a
2359/// A 256-bit vector of [8 x float]. \n
2360/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2361/// the return value. \n
2362/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2363/// the return value. \n
2364/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2365/// return value. \n
2366/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2367/// return value.
2368/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2369/// values.
2370static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2372{
2373 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2374}
2375
2376/// Moves and duplicates even-indexed values from a 256-bit vector of
2377/// [8 x float] to float values in a 256-bit vector of [8 x float].
2378///
2379/// \headerfile <x86intrin.h>
2380///
2381/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2382///
2383/// \param __a
2384/// A 256-bit vector of [8 x float]. \n
2385/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2386/// the return value. \n
2387/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2388/// the return value. \n
2389/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2390/// return value. \n
2391/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2392/// return value.
2393/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2394/// values.
2395static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2397{
2398 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2399}
2400
2401/// Moves and duplicates double-precision floating point values from a
2402/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2403/// vector of [4 x double].
2404///
2405/// \headerfile <x86intrin.h>
2406///
2407/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2408///
2409/// \param __a
2410/// A 256-bit vector of [4 x double]. \n
2411/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2412/// return value. \n
2413/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2414/// the return value.
2415/// \returns A 256-bit vector of [4 x double] containing the moved and
2416/// duplicated values.
2417static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2419{
2420 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2421}
2422
2423/* Unpack and Interleave */
2424/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2425/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2426///
2427/// \headerfile <x86intrin.h>
2428///
2429/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2430///
2431/// \param __a
2432/// A 256-bit floating-point vector of [4 x double]. \n
2433/// Bits [127:64] are written to bits [63:0] of the return value. \n
2434/// Bits [255:192] are written to bits [191:128] of the return value. \n
2435/// \param __b
2436/// A 256-bit floating-point vector of [4 x double]. \n
2437/// Bits [127:64] are written to bits [127:64] of the return value. \n
2438/// Bits [255:192] are written to bits [255:192] of the return value. \n
2439/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2440static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2441_mm256_unpackhi_pd(__m256d __a, __m256d __b) {
2442 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2443}
2444
2445/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2446/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2447///
2448/// \headerfile <x86intrin.h>
2449///
2450/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2451///
2452/// \param __a
2453/// A 256-bit floating-point vector of [4 x double]. \n
2454/// Bits [63:0] are written to bits [63:0] of the return value. \n
2455/// Bits [191:128] are written to bits [191:128] of the return value.
2456/// \param __b
2457/// A 256-bit floating-point vector of [4 x double]. \n
2458/// Bits [63:0] are written to bits [127:64] of the return value. \n
2459/// Bits [191:128] are written to bits [255:192] of the return value. \n
2460/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2461static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2462_mm256_unpacklo_pd(__m256d __a, __m256d __b) {
2463 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2464}
2465
2466/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2467/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2468/// vector of [8 x float].
2469///
2470/// \headerfile <x86intrin.h>
2471///
2472/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2473///
2474/// \param __a
2475/// A 256-bit vector of [8 x float]. \n
2476/// Bits [95:64] are written to bits [31:0] of the return value. \n
2477/// Bits [127:96] are written to bits [95:64] of the return value. \n
2478/// Bits [223:192] are written to bits [159:128] of the return value. \n
2479/// Bits [255:224] are written to bits [223:192] of the return value.
2480/// \param __b
2481/// A 256-bit vector of [8 x float]. \n
2482/// Bits [95:64] are written to bits [63:32] of the return value. \n
2483/// Bits [127:96] are written to bits [127:96] of the return value. \n
2484/// Bits [223:192] are written to bits [191:160] of the return value. \n
2485/// Bits [255:224] are written to bits [255:224] of the return value.
2486/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2487static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2488_mm256_unpackhi_ps(__m256 __a, __m256 __b) {
2489 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2490}
2491
2492/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2493/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2494/// vector of [8 x float].
2495///
2496/// \headerfile <x86intrin.h>
2497///
2498/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2499///
2500/// \param __a
2501/// A 256-bit vector of [8 x float]. \n
2502/// Bits [31:0] are written to bits [31:0] of the return value. \n
2503/// Bits [63:32] are written to bits [95:64] of the return value. \n
2504/// Bits [159:128] are written to bits [159:128] of the return value. \n
2505/// Bits [191:160] are written to bits [223:192] of the return value.
2506/// \param __b
2507/// A 256-bit vector of [8 x float]. \n
2508/// Bits [31:0] are written to bits [63:32] of the return value. \n
2509/// Bits [63:32] are written to bits [127:96] of the return value. \n
2510/// Bits [159:128] are written to bits [191:160] of the return value. \n
2511/// Bits [191:160] are written to bits [255:224] of the return value.
2512/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2513static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2514_mm256_unpacklo_ps(__m256 __a, __m256 __b) {
2515 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2516}
2517
2518/* Bit Test */
2519/// Given two 128-bit floating-point vectors of [2 x double], perform an
2520/// element-by-element comparison of the double-precision element in the
2521/// first source vector and the corresponding element in the second source
2522/// vector.
2523///
2524/// The EFLAGS register is updated as follows: \n
2525/// If there is at least one pair of double-precision elements where the
2526/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2527/// ZF flag is set to 1. \n
2528/// If there is at least one pair of double-precision elements where the
2529/// sign-bit of the first element is 0 and the sign-bit of the second element
2530/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2531/// This intrinsic returns the value of the ZF flag.
2532///
2533/// \headerfile <x86intrin.h>
2534///
2535/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2536///
2537/// \param __a
2538/// A 128-bit vector of [2 x double].
2539/// \param __b
2540/// A 128-bit vector of [2 x double].
2541/// \returns the ZF flag in the EFLAGS register.
2542static __inline int __DEFAULT_FN_ATTRS128
2543_mm_testz_pd(__m128d __a, __m128d __b)
2544{
2545 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2546}
2547
2548/// Given two 128-bit floating-point vectors of [2 x double], perform an
2549/// element-by-element comparison of the double-precision element in the
2550/// first source vector and the corresponding element in the second source
2551/// vector.
2552///
2553/// The EFLAGS register is updated as follows: \n
2554/// If there is at least one pair of double-precision elements where the
2555/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2556/// ZF flag is set to 1. \n
2557/// If there is at least one pair of double-precision elements where the
2558/// sign-bit of the first element is 0 and the sign-bit of the second element
2559/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2560/// This intrinsic returns the value of the CF flag.
2561///
2562/// \headerfile <x86intrin.h>
2563///
2564/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2565///
2566/// \param __a
2567/// A 128-bit vector of [2 x double].
2568/// \param __b
2569/// A 128-bit vector of [2 x double].
2570/// \returns the CF flag in the EFLAGS register.
2571static __inline int __DEFAULT_FN_ATTRS128
2572_mm_testc_pd(__m128d __a, __m128d __b)
2573{
2574 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2575}
2576
2577/// Given two 128-bit floating-point vectors of [2 x double], perform an
2578/// element-by-element comparison of the double-precision element in the
2579/// first source vector and the corresponding element in the second source
2580/// vector.
2581///
2582/// The EFLAGS register is updated as follows: \n
2583/// If there is at least one pair of double-precision elements where the
2584/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2585/// ZF flag is set to 1. \n
2586/// If there is at least one pair of double-precision elements where the
2587/// sign-bit of the first element is 0 and the sign-bit of the second element
2588/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2589/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2590/// otherwise it returns 0.
2591///
2592/// \headerfile <x86intrin.h>
2593///
2594/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2595///
2596/// \param __a
2597/// A 128-bit vector of [2 x double].
2598/// \param __b
2599/// A 128-bit vector of [2 x double].
2600/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2601static __inline int __DEFAULT_FN_ATTRS128
2602_mm_testnzc_pd(__m128d __a, __m128d __b)
2603{
2604 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2605}
2606
2607/// Given two 128-bit floating-point vectors of [4 x float], perform an
2608/// element-by-element comparison of the single-precision element in the
2609/// first source vector and the corresponding element in the second source
2610/// vector.
2611///
2612/// The EFLAGS register is updated as follows: \n
2613/// If there is at least one pair of single-precision elements where the
2614/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2615/// ZF flag is set to 1. \n
2616/// If there is at least one pair of single-precision elements where the
2617/// sign-bit of the first element is 0 and the sign-bit of the second element
2618/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2619/// This intrinsic returns the value of the ZF flag.
2620///
2621/// \headerfile <x86intrin.h>
2622///
2623/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2624///
2625/// \param __a
2626/// A 128-bit vector of [4 x float].
2627/// \param __b
2628/// A 128-bit vector of [4 x float].
2629/// \returns the ZF flag.
2630static __inline int __DEFAULT_FN_ATTRS128
2631_mm_testz_ps(__m128 __a, __m128 __b)
2632{
2633 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2634}
2635
2636/// Given two 128-bit floating-point vectors of [4 x float], perform an
2637/// element-by-element comparison of the single-precision element in the
2638/// first source vector and the corresponding element in the second source
2639/// vector.
2640///
2641/// The EFLAGS register is updated as follows: \n
2642/// If there is at least one pair of single-precision elements where the
2643/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2644/// ZF flag is set to 1. \n
2645/// If there is at least one pair of single-precision elements where the
2646/// sign-bit of the first element is 0 and the sign-bit of the second element
2647/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2648/// This intrinsic returns the value of the CF flag.
2649///
2650/// \headerfile <x86intrin.h>
2651///
2652/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2653///
2654/// \param __a
2655/// A 128-bit vector of [4 x float].
2656/// \param __b
2657/// A 128-bit vector of [4 x float].
2658/// \returns the CF flag.
2659static __inline int __DEFAULT_FN_ATTRS128
2660_mm_testc_ps(__m128 __a, __m128 __b)
2661{
2662 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2663}
2664
2665/// Given two 128-bit floating-point vectors of [4 x float], perform an
2666/// element-by-element comparison of the single-precision element in the
2667/// first source vector and the corresponding element in the second source
2668/// vector.
2669///
2670/// The EFLAGS register is updated as follows: \n
2671/// If there is at least one pair of single-precision elements where the
2672/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2673/// ZF flag is set to 1. \n
2674/// If there is at least one pair of single-precision elements where the
2675/// sign-bit of the first element is 0 and the sign-bit of the second element
2676/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2677/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2678/// otherwise it returns 0.
2679///
2680/// \headerfile <x86intrin.h>
2681///
2682/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2683///
2684/// \param __a
2685/// A 128-bit vector of [4 x float].
2686/// \param __b
2687/// A 128-bit vector of [4 x float].
2688/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2689static __inline int __DEFAULT_FN_ATTRS128
2690_mm_testnzc_ps(__m128 __a, __m128 __b)
2691{
2692 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2693}
2694
2695/// Given two 256-bit floating-point vectors of [4 x double], perform an
2696/// element-by-element comparison of the double-precision elements in the
2697/// first source vector and the corresponding elements in the second source
2698/// vector.
2699///
2700/// The EFLAGS register is updated as follows: \n
2701/// If there is at least one pair of double-precision elements where the
2702/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2703/// ZF flag is set to 1. \n
2704/// If there is at least one pair of double-precision elements where the
2705/// sign-bit of the first element is 0 and the sign-bit of the second element
2706/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2707/// This intrinsic returns the value of the ZF flag.
2708///
2709/// \headerfile <x86intrin.h>
2710///
2711/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2712///
2713/// \param __a
2714/// A 256-bit vector of [4 x double].
2715/// \param __b
2716/// A 256-bit vector of [4 x double].
2717/// \returns the ZF flag.
2718static __inline int __DEFAULT_FN_ATTRS
2719_mm256_testz_pd(__m256d __a, __m256d __b)
2720{
2721 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2722}
2723
2724/// Given two 256-bit floating-point vectors of [4 x double], perform an
2725/// element-by-element comparison of the double-precision elements in the
2726/// first source vector and the corresponding elements in the second source
2727/// vector.
2728///
2729/// The EFLAGS register is updated as follows: \n
2730/// If there is at least one pair of double-precision elements where the
2731/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2732/// ZF flag is set to 1. \n
2733/// If there is at least one pair of double-precision elements where the
2734/// sign-bit of the first element is 0 and the sign-bit of the second element
2735/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2736/// This intrinsic returns the value of the CF flag.
2737///
2738/// \headerfile <x86intrin.h>
2739///
2740/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2741///
2742/// \param __a
2743/// A 256-bit vector of [4 x double].
2744/// \param __b
2745/// A 256-bit vector of [4 x double].
2746/// \returns the CF flag.
2747static __inline int __DEFAULT_FN_ATTRS
2748_mm256_testc_pd(__m256d __a, __m256d __b)
2749{
2750 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2751}
2752
2753/// Given two 256-bit floating-point vectors of [4 x double], perform an
2754/// element-by-element comparison of the double-precision elements in the
2755/// first source vector and the corresponding elements in the second source
2756/// vector.
2757///
2758/// The EFLAGS register is updated as follows: \n
2759/// If there is at least one pair of double-precision elements where the
2760/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2761/// ZF flag is set to 1. \n
2762/// If there is at least one pair of double-precision elements where the
2763/// sign-bit of the first element is 0 and the sign-bit of the second element
2764/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2765/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2766/// otherwise it returns 0.
2767///
2768/// \headerfile <x86intrin.h>
2769///
2770/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2771///
2772/// \param __a
2773/// A 256-bit vector of [4 x double].
2774/// \param __b
2775/// A 256-bit vector of [4 x double].
2776/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2777static __inline int __DEFAULT_FN_ATTRS
2778_mm256_testnzc_pd(__m256d __a, __m256d __b)
2779{
2780 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2781}
2782
2783/// Given two 256-bit floating-point vectors of [8 x float], perform an
2784/// element-by-element comparison of the single-precision element in the
2785/// first source vector and the corresponding element in the second source
2786/// vector.
2787///
2788/// The EFLAGS register is updated as follows: \n
2789/// If there is at least one pair of single-precision elements where the
2790/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2791/// ZF flag is set to 1. \n
2792/// If there is at least one pair of single-precision elements where the
2793/// sign-bit of the first element is 0 and the sign-bit of the second element
2794/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2795/// This intrinsic returns the value of the ZF flag.
2796///
2797/// \headerfile <x86intrin.h>
2798///
2799/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2800///
2801/// \param __a
2802/// A 256-bit vector of [8 x float].
2803/// \param __b
2804/// A 256-bit vector of [8 x float].
2805/// \returns the ZF flag.
2806static __inline int __DEFAULT_FN_ATTRS
2807_mm256_testz_ps(__m256 __a, __m256 __b)
2808{
2809 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2810}
2811
2812/// Given two 256-bit floating-point vectors of [8 x float], perform an
2813/// element-by-element comparison of the single-precision element in the
2814/// first source vector and the corresponding element in the second source
2815/// vector.
2816///
2817/// The EFLAGS register is updated as follows: \n
2818/// If there is at least one pair of single-precision elements where the
2819/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2820/// ZF flag is set to 1. \n
2821/// If there is at least one pair of single-precision elements where the
2822/// sign-bit of the first element is 0 and the sign-bit of the second element
2823/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2824/// This intrinsic returns the value of the CF flag.
2825///
2826/// \headerfile <x86intrin.h>
2827///
2828/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2829///
2830/// \param __a
2831/// A 256-bit vector of [8 x float].
2832/// \param __b
2833/// A 256-bit vector of [8 x float].
2834/// \returns the CF flag.
2835static __inline int __DEFAULT_FN_ATTRS
2836_mm256_testc_ps(__m256 __a, __m256 __b)
2837{
2838 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2839}
2840
2841/// Given two 256-bit floating-point vectors of [8 x float], perform an
2842/// element-by-element comparison of the single-precision elements in the
2843/// first source vector and the corresponding elements in the second source
2844/// vector.
2845///
2846/// The EFLAGS register is updated as follows: \n
2847/// If there is at least one pair of single-precision elements where the
2848/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2849/// ZF flag is set to 1. \n
2850/// If there is at least one pair of single-precision elements where the
2851/// sign-bit of the first element is 0 and the sign-bit of the second element
2852/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2853/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2854/// otherwise it returns 0.
2855///
2856/// \headerfile <x86intrin.h>
2857///
2858/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2859///
2860/// \param __a
2861/// A 256-bit vector of [8 x float].
2862/// \param __b
2863/// A 256-bit vector of [8 x float].
2864/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2865static __inline int __DEFAULT_FN_ATTRS
2867{
2868 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2869}
2870
2871/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2872/// of the two source vectors.
2873///
2874/// The EFLAGS register is updated as follows: \n
2875/// If there is at least one pair of bits where both bits are 1, the ZF flag
2876/// is set to 0. Otherwise the ZF flag is set to 1. \n
2877/// If there is at least one pair of bits where the bit from the first source
2878/// vector is 0 and the bit from the second source vector is 1, the CF flag
2879/// is set to 0. Otherwise the CF flag is set to 1. \n
2880/// This intrinsic returns the value of the ZF flag.
2881///
2882/// \headerfile <x86intrin.h>
2883///
2884/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2885///
2886/// \param __a
2887/// A 256-bit integer vector.
2888/// \param __b
2889/// A 256-bit integer vector.
2890/// \returns the ZF flag.
2891static __inline int __DEFAULT_FN_ATTRS
2892_mm256_testz_si256(__m256i __a, __m256i __b)
2893{
2894 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2895}
2896
2897/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2898/// of the two source vectors.
2899///
2900/// The EFLAGS register is updated as follows: \n
2901/// If there is at least one pair of bits where both bits are 1, the ZF flag
2902/// is set to 0. Otherwise the ZF flag is set to 1. \n
2903/// If there is at least one pair of bits where the bit from the first source
2904/// vector is 0 and the bit from the second source vector is 1, the CF flag
2905/// is set to 0. Otherwise the CF flag is set to 1. \n
2906/// This intrinsic returns the value of the CF flag.
2907///
2908/// \headerfile <x86intrin.h>
2909///
2910/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2911///
2912/// \param __a
2913/// A 256-bit integer vector.
2914/// \param __b
2915/// A 256-bit integer vector.
2916/// \returns the CF flag.
2917static __inline int __DEFAULT_FN_ATTRS
2918_mm256_testc_si256(__m256i __a, __m256i __b)
2919{
2920 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2921}
2922
2923/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2924/// of the two source vectors.
2925///
2926/// The EFLAGS register is updated as follows: \n
2927/// If there is at least one pair of bits where both bits are 1, the ZF flag
2928/// is set to 0. Otherwise the ZF flag is set to 1. \n
2929/// If there is at least one pair of bits where the bit from the first source
2930/// vector is 0 and the bit from the second source vector is 1, the CF flag
2931/// is set to 0. Otherwise the CF flag is set to 1. \n
2932/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2933/// otherwise it returns 0.
2934///
2935/// \headerfile <x86intrin.h>
2936///
2937/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2938///
2939/// \param __a
2940/// A 256-bit integer vector.
2941/// \param __b
2942/// A 256-bit integer vector.
2943/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2944static __inline int __DEFAULT_FN_ATTRS
2946{
2947 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2948}
2949
2950/* Vector extract sign mask */
2951/// Extracts the sign bits of double-precision floating point elements
2952/// in a 256-bit vector of [4 x double] and writes them to the lower order
2953/// bits of the return value.
2954///
2955/// \headerfile <x86intrin.h>
2956///
2957/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2958///
2959/// \param __a
2960/// A 256-bit vector of [4 x double] containing the double-precision
2961/// floating point values with sign bits to be extracted.
2962/// \returns The sign bits from the operand, written to bits [3:0].
2963static __inline int __DEFAULT_FN_ATTRS
2965{
2966 return __builtin_ia32_movmskpd256((__v4df)__a);
2967}
2968
2969/// Extracts the sign bits of single-precision floating point elements
2970/// in a 256-bit vector of [8 x float] and writes them to the lower order
2971/// bits of the return value.
2972///
2973/// \headerfile <x86intrin.h>
2974///
2975/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2976///
2977/// \param __a
2978/// A 256-bit vector of [8 x float] containing the single-precision floating
2979/// point values with sign bits to be extracted.
2980/// \returns The sign bits from the operand, written to bits [7:0].
2981static __inline int __DEFAULT_FN_ATTRS
2983{
2984 return __builtin_ia32_movmskps256((__v8sf)__a);
2985}
2986
/* Vector __zero */
/// Clears the contents of all XMM or YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
    __attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void) {
  __builtin_ia32_vzeroall();
}
2998
/// Clears the upper 128 bits (bits 255:128) of all YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
    __attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void) {
  __builtin_ia32_vzeroupper();
}
3009
3010/* Vector load with broadcast */
3011/// Loads a scalar single-precision floating point value from the
3012/// specified address pointed to by \a __a and broadcasts it to the elements
3013/// of a [4 x float] vector.
3014///
3015/// \headerfile <x86intrin.h>
3016///
3017/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3018///
3019/// \param __a
3020/// The single-precision floating point value to be broadcast.
3021/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3022/// equal to the broadcast value.
3023static __inline __m128 __DEFAULT_FN_ATTRS128
3025{
3026 struct __mm_broadcast_ss_struct {
3027 float __f;
3028 } __attribute__((__packed__, __may_alias__));
3029 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3030 return __extension__ (__m128){ __f, __f, __f, __f };
3031}
3032
3033/// Loads a scalar double-precision floating point value from the
3034/// specified address pointed to by \a __a and broadcasts it to the elements
3035/// of a [4 x double] vector.
3036///
3037/// \headerfile <x86intrin.h>
3038///
3039/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3040///
3041/// \param __a
3042/// The double-precision floating point value to be broadcast.
3043/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3044/// equal to the broadcast value.
3045static __inline __m256d __DEFAULT_FN_ATTRS
3047{
3048 struct __mm256_broadcast_sd_struct {
3049 double __d;
3050 } __attribute__((__packed__, __may_alias__));
3051 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3052 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3053}
3054
3055/// Loads a scalar single-precision floating point value from the
3056/// specified address pointed to by \a __a and broadcasts it to the elements
3057/// of a [8 x float] vector.
3058///
3059/// \headerfile <x86intrin.h>
3060///
3061/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3062///
3063/// \param __a
3064/// The single-precision floating point value to be broadcast.
3065/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3066/// equal to the broadcast value.
3067static __inline __m256 __DEFAULT_FN_ATTRS
3069{
3070 struct __mm256_broadcast_ss_struct {
3071 float __f;
3072 } __attribute__((__packed__, __may_alias__));
3073 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3074 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3075}
3076
3077/// Loads the data from a 128-bit vector of [2 x double] from the
3078/// specified address pointed to by \a __a and broadcasts it to 128-bit
3079/// elements in a 256-bit vector of [4 x double].
3080///
3081/// \headerfile <x86intrin.h>
3082///
3083/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3084///
3085/// \param __a
3086/// The 128-bit vector of [2 x double] to be broadcast.
3087/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3088/// equal to the broadcast value.
3089static __inline __m256d __DEFAULT_FN_ATTRS
3091{
3092 __m128d __b = _mm_loadu_pd((const double *)__a);
3093 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3094 0, 1, 0, 1);
3095}
3096
3097/// Loads the data from a 128-bit vector of [4 x float] from the
3098/// specified address pointed to by \a __a and broadcasts it to 128-bit
3099/// elements in a 256-bit vector of [8 x float].
3100///
3101/// \headerfile <x86intrin.h>
3102///
3103/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3104///
3105/// \param __a
3106/// The 128-bit vector of [4 x float] to be broadcast.
3107/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3108/// equal to the broadcast value.
3109static __inline __m256 __DEFAULT_FN_ATTRS
3111{
3112 __m128 __b = _mm_loadu_ps((const float *)__a);
3113 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3114 0, 1, 2, 3, 0, 1, 2, 3);
3115}
3116
3117/* SIMD load ops */
3118/// Loads 4 double-precision floating point values from a 32-byte aligned
3119/// memory location pointed to by \a __p into a vector of [4 x double].
3120///
3121/// \headerfile <x86intrin.h>
3122///
3123/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3124///
3125/// \param __p
3126/// A 32-byte aligned pointer to a memory location containing
3127/// double-precision floating point values.
3128/// \returns A 256-bit vector of [4 x double] containing the moved values.
3129static __inline __m256d __DEFAULT_FN_ATTRS
3130_mm256_load_pd(double const *__p)
3131{
3132 return *(const __m256d *)__p;
3133}
3134
3135/// Loads 8 single-precision floating point values from a 32-byte aligned
3136/// memory location pointed to by \a __p into a vector of [8 x float].
3137///
3138/// \headerfile <x86intrin.h>
3139///
3140/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3141///
3142/// \param __p
3143/// A 32-byte aligned pointer to a memory location containing float values.
3144/// \returns A 256-bit vector of [8 x float] containing the moved values.
3145static __inline __m256 __DEFAULT_FN_ATTRS
3146_mm256_load_ps(float const *__p)
3147{
3148 return *(const __m256 *)__p;
3149}
3150
3151/// Loads 4 double-precision floating point values from an unaligned
3152/// memory location pointed to by \a __p into a vector of [4 x double].
3153///
3154/// \headerfile <x86intrin.h>
3155///
3156/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3157///
3158/// \param __p
3159/// A pointer to a memory location containing double-precision floating
3160/// point values.
3161/// \returns A 256-bit vector of [4 x double] containing the moved values.
3162static __inline __m256d __DEFAULT_FN_ATTRS
3163_mm256_loadu_pd(double const *__p)
3164{
3165 struct __loadu_pd {
3166 __m256d_u __v;
3167 } __attribute__((__packed__, __may_alias__));
3168 return ((const struct __loadu_pd*)__p)->__v;
3169}
3170
3171/// Loads 8 single-precision floating point values from an unaligned
3172/// memory location pointed to by \a __p into a vector of [8 x float].
3173///
3174/// \headerfile <x86intrin.h>
3175///
3176/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3177///
3178/// \param __p
3179/// A pointer to a memory location containing single-precision floating
3180/// point values.
3181/// \returns A 256-bit vector of [8 x float] containing the moved values.
3182static __inline __m256 __DEFAULT_FN_ATTRS
3184{
3185 struct __loadu_ps {
3186 __m256_u __v;
3187 } __attribute__((__packed__, __may_alias__));
3188 return ((const struct __loadu_ps*)__p)->__v;
3189}
3190
3191/// Loads 256 bits of integer data from a 32-byte aligned memory
3192/// location pointed to by \a __p into elements of a 256-bit integer vector.
3193///
3194/// \headerfile <x86intrin.h>
3195///
3196/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3197///
3198/// \param __p
3199/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3200/// values.
3201/// \returns A 256-bit integer vector containing the moved values.
3202static __inline __m256i __DEFAULT_FN_ATTRS
3203_mm256_load_si256(__m256i const *__p)
3204{
3205 return *__p;
3206}
3207
3208/// Loads 256 bits of integer data from an unaligned memory location
3209/// pointed to by \a __p into a 256-bit integer vector.
3210///
3211/// \headerfile <x86intrin.h>
3212///
3213/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3214///
3215/// \param __p
3216/// A pointer to a 256-bit integer vector containing integer values.
3217/// \returns A 256-bit integer vector containing the moved values.
3218static __inline __m256i __DEFAULT_FN_ATTRS
3219_mm256_loadu_si256(__m256i_u const *__p)
3220{
3221 struct __loadu_si256 {
3222 __m256i_u __v;
3223 } __attribute__((__packed__, __may_alias__));
3224 return ((const struct __loadu_si256*)__p)->__v;
3225}
3226
3227/// Loads 256 bits of integer data from an unaligned memory location
3228/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3229/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3230/// line boundary.
3231///
3232/// \headerfile <x86intrin.h>
3233///
3234/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3235///
3236/// \param __p
3237/// A pointer to a 256-bit integer vector containing integer values.
3238/// \returns A 256-bit integer vector containing the moved values.
3239static __inline __m256i __DEFAULT_FN_ATTRS
3240_mm256_lddqu_si256(__m256i_u const *__p)
3241{
3242 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3243}
3244
3245/* SIMD store ops */
3246/// Stores double-precision floating point values from a 256-bit vector
3247/// of [4 x double] to a 32-byte aligned memory location pointed to by
3248/// \a __p.
3249///
3250/// \headerfile <x86intrin.h>
3251///
3252/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3253///
3254/// \param __p
3255/// A 32-byte aligned pointer to a memory location that will receive the
3256/// double-precision floaing point values.
3257/// \param __a
3258/// A 256-bit vector of [4 x double] containing the values to be moved.
3259static __inline void __DEFAULT_FN_ATTRS
3260_mm256_store_pd(double *__p, __m256d __a)
3261{
3262 *(__m256d *)__p = __a;
3263}
3264
3265/// Stores single-precision floating point values from a 256-bit vector
3266/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3267///
3268/// \headerfile <x86intrin.h>
3269///
3270/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3271///
3272/// \param __p
3273/// A 32-byte aligned pointer to a memory location that will receive the
3274/// float values.
3275/// \param __a
3276/// A 256-bit vector of [8 x float] containing the values to be moved.
3277static __inline void __DEFAULT_FN_ATTRS
3278_mm256_store_ps(float *__p, __m256 __a)
3279{
3280 *(__m256 *)__p = __a;
3281}
3282
3283/// Stores double-precision floating point values from a 256-bit vector
3284/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3285///
3286/// \headerfile <x86intrin.h>
3287///
3288/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3289///
3290/// \param __p
3291/// A pointer to a memory location that will receive the double-precision
3292/// floating point values.
3293/// \param __a
3294/// A 256-bit vector of [4 x double] containing the values to be moved.
3295static __inline void __DEFAULT_FN_ATTRS
3296_mm256_storeu_pd(double *__p, __m256d __a)
3297{
3298 struct __storeu_pd {
3299 __m256d_u __v;
3300 } __attribute__((__packed__, __may_alias__));
3301 ((struct __storeu_pd*)__p)->__v = __a;
3302}
3303
3304/// Stores single-precision floating point values from a 256-bit vector
3305/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3306///
3307/// \headerfile <x86intrin.h>
3308///
3309/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3310///
3311/// \param __p
3312/// A pointer to a memory location that will receive the float values.
3313/// \param __a
3314/// A 256-bit vector of [8 x float] containing the values to be moved.
3315static __inline void __DEFAULT_FN_ATTRS
3316_mm256_storeu_ps(float *__p, __m256 __a)
3317{
3318 struct __storeu_ps {
3319 __m256_u __v;
3320 } __attribute__((__packed__, __may_alias__));
3321 ((struct __storeu_ps*)__p)->__v = __a;
3322}
3323
3324/// Stores integer values from a 256-bit integer vector to a 32-byte
3325/// aligned memory location pointed to by \a __p.
3326///
3327/// \headerfile <x86intrin.h>
3328///
3329/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3330///
3331/// \param __p
3332/// A 32-byte aligned pointer to a memory location that will receive the
3333/// integer values.
3334/// \param __a
3335/// A 256-bit integer vector containing the values to be moved.
3336static __inline void __DEFAULT_FN_ATTRS
3337_mm256_store_si256(__m256i *__p, __m256i __a)
3338{
3339 *__p = __a;
3340}
3341
3342/// Stores integer values from a 256-bit integer vector to an unaligned
3343/// memory location pointed to by \a __p.
3344///
3345/// \headerfile <x86intrin.h>
3346///
3347/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3348///
3349/// \param __p
3350/// A pointer to a memory location that will receive the integer values.
3351/// \param __a
3352/// A 256-bit integer vector containing the values to be moved.
3353static __inline void __DEFAULT_FN_ATTRS
3354_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3355{
3356 struct __storeu_si256 {
3357 __m256i_u __v;
3358 } __attribute__((__packed__, __may_alias__));
3359 ((struct __storeu_si256*)__p)->__v = __a;
3360}
3361
3362/* Conditional load ops */
3363/// Conditionally loads double-precision floating point elements from a
3364/// memory location pointed to by \a __p into a 128-bit vector of
3365/// [2 x double], depending on the mask bits associated with each data
3366/// element.
3367///
3368/// \headerfile <x86intrin.h>
3369///
3370/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3371///
3372/// \param __p
3373/// A pointer to a memory location that contains the double-precision
3374/// floating point values.
3375/// \param __m
3376/// A 128-bit integer vector containing the mask. The most significant bit of
3377/// each data element represents the mask bits. If a mask bit is zero, the
3378/// corresponding value in the memory location is not loaded and the
3379/// corresponding field in the return value is set to zero.
3380/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3381static __inline __m128d __DEFAULT_FN_ATTRS128
3382_mm_maskload_pd(double const *__p, __m128i __m)
3383{
3384 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3385}
3386
3387/// Conditionally loads double-precision floating point elements from a
3388/// memory location pointed to by \a __p into a 256-bit vector of
3389/// [4 x double], depending on the mask bits associated with each data
3390/// element.
3391///
3392/// \headerfile <x86intrin.h>
3393///
3394/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3395///
3396/// \param __p
3397/// A pointer to a memory location that contains the double-precision
3398/// floating point values.
3399/// \param __m
3400/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3401/// significant bit of each quadword element represents the mask bits. If a
3402/// mask bit is zero, the corresponding value in the memory location is not
3403/// loaded and the corresponding field in the return value is set to zero.
3404/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3405static __inline __m256d __DEFAULT_FN_ATTRS
3406_mm256_maskload_pd(double const *__p, __m256i __m)
3407{
3408 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3409 (__v4di)__m);
3410}
3411
3412/// Conditionally loads single-precision floating point elements from a
3413/// memory location pointed to by \a __p into a 128-bit vector of
3414/// [4 x float], depending on the mask bits associated with each data
3415/// element.
3416///
3417/// \headerfile <x86intrin.h>
3418///
3419/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3420///
3421/// \param __p
3422/// A pointer to a memory location that contains the single-precision
3423/// floating point values.
3424/// \param __m
3425/// A 128-bit integer vector containing the mask. The most significant bit of
3426/// each data element represents the mask bits. If a mask bit is zero, the
3427/// corresponding value in the memory location is not loaded and the
3428/// corresponding field in the return value is set to zero.
3429/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3430static __inline __m128 __DEFAULT_FN_ATTRS128
3431_mm_maskload_ps(float const *__p, __m128i __m)
3432{
3433 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3434}
3435
3436/// Conditionally loads single-precision floating point elements from a
3437/// memory location pointed to by \a __p into a 256-bit vector of
3438/// [8 x float], depending on the mask bits associated with each data
3439/// element.
3440///
3441/// \headerfile <x86intrin.h>
3442///
3443/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3444///
3445/// \param __p
3446/// A pointer to a memory location that contains the single-precision
3447/// floating point values.
3448/// \param __m
3449/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3450/// significant bit of each dword element represents the mask bits. If a mask
3451/// bit is zero, the corresponding value in the memory location is not loaded
3452/// and the corresponding field in the return value is set to zero.
3453/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3454static __inline __m256 __DEFAULT_FN_ATTRS
3455_mm256_maskload_ps(float const *__p, __m256i __m)
3456{
3457 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3458}
3459
3460/* Conditional store ops */
3461/// Moves single-precision floating point values from a 256-bit vector
3462/// of [8 x float] to a memory location pointed to by \a __p, according to
3463/// the specified mask.
3464///
3465/// \headerfile <x86intrin.h>
3466///
3467/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3468///
3469/// \param __p
3470/// A pointer to a memory location that will receive the float values.
3471/// \param __m
3472/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3473/// significant bit of each dword element in the mask vector represents the
3474/// mask bits. If a mask bit is zero, the corresponding value from vector
3475/// \a __a is not stored and the corresponding field in the memory location
3476/// pointed to by \a __p is not changed.
3477/// \param __a
3478/// A 256-bit vector of [8 x float] containing the values to be stored.
3479static __inline void __DEFAULT_FN_ATTRS
3480_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3481{
3482 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3483}
3484
3485/// Moves double-precision values from a 128-bit vector of [2 x double]
3486/// to a memory location pointed to by \a __p, according to the specified
3487/// mask.
3488///
3489/// \headerfile <x86intrin.h>
3490///
3491/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3492///
3493/// \param __p
3494/// A pointer to a memory location that will receive the float values.
3495/// \param __m
3496/// A 128-bit integer vector containing the mask. The most significant bit of
3497/// each field in the mask vector represents the mask bits. If a mask bit is
3498/// zero, the corresponding value from vector \a __a is not stored and the
3499/// corresponding field in the memory location pointed to by \a __p is not
3500/// changed.
3501/// \param __a
3502/// A 128-bit vector of [2 x double] containing the values to be stored.
3503static __inline void __DEFAULT_FN_ATTRS128
3504_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3505{
3506 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3507}
3508
3509/// Moves double-precision values from a 256-bit vector of [4 x double]
3510/// to a memory location pointed to by \a __p, according to the specified
3511/// mask.
3512///
3513/// \headerfile <x86intrin.h>
3514///
3515/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3516///
3517/// \param __p
3518/// A pointer to a memory location that will receive the float values.
3519/// \param __m
3520/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3521/// significant bit of each quadword element in the mask vector represents
3522/// the mask bits. If a mask bit is zero, the corresponding value from vector
3523/// __a is not stored and the corresponding field in the memory location
3524/// pointed to by \a __p is not changed.
3525/// \param __a
3526/// A 256-bit vector of [4 x double] containing the values to be stored.
3527static __inline void __DEFAULT_FN_ATTRS
3528_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3529{
3530 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3531}
3532
3533/// Moves single-precision floating point values from a 128-bit vector
3534/// of [4 x float] to a memory location pointed to by \a __p, according to
3535/// the specified mask.
3536///
3537/// \headerfile <x86intrin.h>
3538///
3539/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3540///
3541/// \param __p
3542/// A pointer to a memory location that will receive the float values.
3543/// \param __m
3544/// A 128-bit integer vector containing the mask. The most significant bit of
3545/// each field in the mask vector represents the mask bits. If a mask bit is
3546/// zero, the corresponding value from vector __a is not stored and the
3547/// corresponding field in the memory location pointed to by \a __p is not
3548/// changed.
3549/// \param __a
3550/// A 128-bit vector of [4 x float] containing the values to be stored.
3551static __inline void __DEFAULT_FN_ATTRS128
3552_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3553{
3554 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3555}
3556
3557/* Cacheability support ops */
3558/// Moves integer data from a 256-bit integer vector to a 32-byte
3559/// aligned memory location. To minimize caching, the data is flagged as
3560/// non-temporal (unlikely to be used again soon).
3561///
3562/// \headerfile <x86intrin.h>
3563///
3564/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3565///
3566/// \param __a
3567/// A pointer to a 32-byte aligned memory location that will receive the
3568/// integer values.
3569/// \param __b
3570/// A 256-bit integer vector containing the values to be moved.
3571static __inline void __DEFAULT_FN_ATTRS
3573{
3574 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3575 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3576}
3577
3578/// Moves double-precision values from a 256-bit vector of [4 x double]
3579/// to a 32-byte aligned memory location. To minimize caching, the data is
3580/// flagged as non-temporal (unlikely to be used again soon).
3581///
3582/// \headerfile <x86intrin.h>
3583///
3584/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3585///
3586/// \param __a
3587/// A pointer to a 32-byte aligned memory location that will receive the
3588/// double-precision floating-point values.
3589/// \param __b
3590/// A 256-bit vector of [4 x double] containing the values to be moved.
3591static __inline void __DEFAULT_FN_ATTRS
3592_mm256_stream_pd(void *__a, __m256d __b)
3593{
3594 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3595 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3596}
3597
3598/// Moves single-precision floating point values from a 256-bit vector
3599/// of [8 x float] to a 32-byte aligned memory location. To minimize
3600/// caching, the data is flagged as non-temporal (unlikely to be used again
3601/// soon).
3602///
3603/// \headerfile <x86intrin.h>
3604///
3605/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3606///
3607/// \param __p
3608/// A pointer to a 32-byte aligned memory location that will receive the
3609/// single-precision floating point values.
3610/// \param __a
3611/// A 256-bit vector of [8 x float] containing the values to be moved.
3612static __inline void __DEFAULT_FN_ATTRS
3614{
3615 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3616 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3617}
3618
3619/* Create vectors */
3620/// Create a 256-bit vector of [4 x double] with undefined values.
3621///
3622/// \headerfile <x86intrin.h>
3623///
3624/// This intrinsic has no corresponding instruction.
3625///
3626/// \returns A 256-bit vector of [4 x double] containing undefined values.
3627static __inline__ __m256d __DEFAULT_FN_ATTRS
3629{
3630 return (__m256d)__builtin_ia32_undef256();
3631}
3632
3633/// Create a 256-bit vector of [8 x float] with undefined values.
3634///
3635/// \headerfile <x86intrin.h>
3636///
3637/// This intrinsic has no corresponding instruction.
3638///
3639/// \returns A 256-bit vector of [8 x float] containing undefined values.
3640static __inline__ __m256 __DEFAULT_FN_ATTRS
3642{
3643 return (__m256)__builtin_ia32_undef256();
3644}
3645
3646/// Create a 256-bit integer vector with undefined values.
3647///
3648/// \headerfile <x86intrin.h>
3649///
3650/// This intrinsic has no corresponding instruction.
3651///
3652/// \returns A 256-bit integer vector containing undefined values.
3653static __inline__ __m256i __DEFAULT_FN_ATTRS
3655{
3656 return (__m256i)__builtin_ia32_undef256();
3657}
3658
3659/// Constructs a 256-bit floating-point vector of [4 x double]
3660/// initialized with the specified double-precision floating-point values.
3661///
3662/// \headerfile <x86intrin.h>
3663///
3664/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3665/// instruction.
3666///
3667/// \param __a
3668/// A double-precision floating-point value used to initialize bits [255:192]
3669/// of the result.
3670/// \param __b
3671/// A double-precision floating-point value used to initialize bits [191:128]
3672/// of the result.
3673/// \param __c
3674/// A double-precision floating-point value used to initialize bits [127:64]
3675/// of the result.
3676/// \param __d
3677/// A double-precision floating-point value used to initialize bits [63:0]
3678/// of the result.
3679/// \returns An initialized 256-bit floating-point vector of [4 x double].
3680static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3681_mm256_set_pd(double __a, double __b, double __c, double __d)
3682{
3683 return __extension__ (__m256d){ __d, __c, __b, __a };
3684}
3685
3686/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3687/// with the specified single-precision floating-point values.
3688///
3689/// \headerfile <x86intrin.h>
3690///
3691/// This intrinsic is a utility function and does not correspond to a specific
3692/// instruction.
3693///
3694/// \param __a
3695/// A single-precision floating-point value used to initialize bits [255:224]
3696/// of the result.
3697/// \param __b
3698/// A single-precision floating-point value used to initialize bits [223:192]
3699/// of the result.
3700/// \param __c
3701/// A single-precision floating-point value used to initialize bits [191:160]
3702/// of the result.
3703/// \param __d
3704/// A single-precision floating-point value used to initialize bits [159:128]
3705/// of the result.
3706/// \param __e
3707/// A single-precision floating-point value used to initialize bits [127:96]
3708/// of the result.
3709/// \param __f
3710/// A single-precision floating-point value used to initialize bits [95:64]
3711/// of the result.
3712/// \param __g
3713/// A single-precision floating-point value used to initialize bits [63:32]
3714/// of the result.
3715/// \param __h
3716/// A single-precision floating-point value used to initialize bits [31:0]
3717/// of the result.
3718/// \returns An initialized 256-bit floating-point vector of [8 x float].
3719static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3720_mm256_set_ps(float __a, float __b, float __c, float __d,
3721 float __e, float __f, float __g, float __h)
3722{
3723 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3724}
3725
3726/// Constructs a 256-bit integer vector initialized with the specified
3727/// 32-bit integral values.
3728///
3729/// \headerfile <x86intrin.h>
3730///
3731/// This intrinsic is a utility function and does not correspond to a specific
3732/// instruction.
3733///
3734/// \param __i0
3735/// A 32-bit integral value used to initialize bits [255:224] of the result.
3736/// \param __i1
3737/// A 32-bit integral value used to initialize bits [223:192] of the result.
3738/// \param __i2
3739/// A 32-bit integral value used to initialize bits [191:160] of the result.
3740/// \param __i3
3741/// A 32-bit integral value used to initialize bits [159:128] of the result.
3742/// \param __i4
3743/// A 32-bit integral value used to initialize bits [127:96] of the result.
3744/// \param __i5
3745/// A 32-bit integral value used to initialize bits [95:64] of the result.
3746/// \param __i6
3747/// A 32-bit integral value used to initialize bits [63:32] of the result.
3748/// \param __i7
3749/// A 32-bit integral value used to initialize bits [31:0] of the result.
3750/// \returns An initialized 256-bit integer vector.
3751static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3752_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3753 int __i4, int __i5, int __i6, int __i7)
3754{
3755 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3756}
3757
3758/// Constructs a 256-bit integer vector initialized with the specified
3759/// 16-bit integral values.
3760///
3761/// \headerfile <x86intrin.h>
3762///
3763/// This intrinsic is a utility function and does not correspond to a specific
3764/// instruction.
3765///
3766/// \param __w15
3767/// A 16-bit integral value used to initialize bits [255:240] of the result.
3768/// \param __w14
3769/// A 16-bit integral value used to initialize bits [239:224] of the result.
3770/// \param __w13
3771/// A 16-bit integral value used to initialize bits [223:208] of the result.
3772/// \param __w12
3773/// A 16-bit integral value used to initialize bits [207:192] of the result.
3774/// \param __w11
3775/// A 16-bit integral value used to initialize bits [191:176] of the result.
3776/// \param __w10
3777/// A 16-bit integral value used to initialize bits [175:160] of the result.
3778/// \param __w09
3779/// A 16-bit integral value used to initialize bits [159:144] of the result.
3780/// \param __w08
3781/// A 16-bit integral value used to initialize bits [143:128] of the result.
3782/// \param __w07
3783/// A 16-bit integral value used to initialize bits [127:112] of the result.
3784/// \param __w06
3785/// A 16-bit integral value used to initialize bits [111:96] of the result.
3786/// \param __w05
3787/// A 16-bit integral value used to initialize bits [95:80] of the result.
3788/// \param __w04
3789/// A 16-bit integral value used to initialize bits [79:64] of the result.
3790/// \param __w03
3791/// A 16-bit integral value used to initialize bits [63:48] of the result.
3792/// \param __w02
3793/// A 16-bit integral value used to initialize bits [47:32] of the result.
3794/// \param __w01
3795/// A 16-bit integral value used to initialize bits [31:16] of the result.
3796/// \param __w00
3797/// A 16-bit integral value used to initialize bits [15:0] of the result.
3798/// \returns An initialized 256-bit integer vector.
3799static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3800_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3801 short __w11, short __w10, short __w09, short __w08,
3802 short __w07, short __w06, short __w05, short __w04,
3803 short __w03, short __w02, short __w01, short __w00)
3804{
3805 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3806 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3807}
3808
3809/// Constructs a 256-bit integer vector initialized with the specified
3810/// 8-bit integral values.
3811///
3812/// \headerfile <x86intrin.h>
3813///
3814/// This intrinsic is a utility function and does not correspond to a specific
3815/// instruction.
3816///
3817/// \param __b31
3818/// An 8-bit integral value used to initialize bits [255:248] of the result.
3819/// \param __b30
3820/// An 8-bit integral value used to initialize bits [247:240] of the result.
3821/// \param __b29
3822/// An 8-bit integral value used to initialize bits [239:232] of the result.
3823/// \param __b28
3824/// An 8-bit integral value used to initialize bits [231:224] of the result.
3825/// \param __b27
3826/// An 8-bit integral value used to initialize bits [223:216] of the result.
3827/// \param __b26
3828/// An 8-bit integral value used to initialize bits [215:208] of the result.
3829/// \param __b25
3830/// An 8-bit integral value used to initialize bits [207:200] of the result.
3831/// \param __b24
3832/// An 8-bit integral value used to initialize bits [199:192] of the result.
3833/// \param __b23
3834/// An 8-bit integral value used to initialize bits [191:184] of the result.
3835/// \param __b22
3836/// An 8-bit integral value used to initialize bits [183:176] of the result.
3837/// \param __b21
3838/// An 8-bit integral value used to initialize bits [175:168] of the result.
3839/// \param __b20
3840/// An 8-bit integral value used to initialize bits [167:160] of the result.
3841/// \param __b19
3842/// An 8-bit integral value used to initialize bits [159:152] of the result.
3843/// \param __b18
3844/// An 8-bit integral value used to initialize bits [151:144] of the result.
3845/// \param __b17
3846/// An 8-bit integral value used to initialize bits [143:136] of the result.
3847/// \param __b16
3848/// An 8-bit integral value used to initialize bits [135:128] of the result.
3849/// \param __b15
3850/// An 8-bit integral value used to initialize bits [127:120] of the result.
3851/// \param __b14
3852/// An 8-bit integral value used to initialize bits [119:112] of the result.
3853/// \param __b13
3854/// An 8-bit integral value used to initialize bits [111:104] of the result.
3855/// \param __b12
3856/// An 8-bit integral value used to initialize bits [103:96] of the result.
3857/// \param __b11
3858/// An 8-bit integral value used to initialize bits [95:88] of the result.
3859/// \param __b10
3860/// An 8-bit integral value used to initialize bits [87:80] of the result.
3861/// \param __b09
3862/// An 8-bit integral value used to initialize bits [79:72] of the result.
3863/// \param __b08
3864/// An 8-bit integral value used to initialize bits [71:64] of the result.
3865/// \param __b07
3866/// An 8-bit integral value used to initialize bits [63:56] of the result.
3867/// \param __b06
3868/// An 8-bit integral value used to initialize bits [55:48] of the result.
3869/// \param __b05
3870/// An 8-bit integral value used to initialize bits [47:40] of the result.
3871/// \param __b04
3872/// An 8-bit integral value used to initialize bits [39:32] of the result.
3873/// \param __b03
3874/// An 8-bit integral value used to initialize bits [31:24] of the result.
3875/// \param __b02
3876/// An 8-bit integral value used to initialize bits [23:16] of the result.
3877/// \param __b01
3878/// An 8-bit integral value used to initialize bits [15:8] of the result.
3879/// \param __b00
3880/// An 8-bit integral value used to initialize bits [7:0] of the result.
3881/// \returns An initialized 256-bit integer vector.
3882static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3883_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3884 char __b27, char __b26, char __b25, char __b24,
3885 char __b23, char __b22, char __b21, char __b20,
3886 char __b19, char __b18, char __b17, char __b16,
3887 char __b15, char __b14, char __b13, char __b12,
3888 char __b11, char __b10, char __b09, char __b08,
3889 char __b07, char __b06, char __b05, char __b04,
3890 char __b03, char __b02, char __b01, char __b00)
3891{
3892 return __extension__ (__m256i)(__v32qi){
3893 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3894 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3895 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3896 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3897 };
3898}
3899
3900/// Constructs a 256-bit integer vector initialized with the specified
3901/// 64-bit integral values.
3902///
3903/// \headerfile <x86intrin.h>
3904///
3905/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3906/// instruction.
3907///
3908/// \param __a
3909/// A 64-bit integral value used to initialize bits [255:192] of the result.
3910/// \param __b
3911/// A 64-bit integral value used to initialize bits [191:128] of the result.
3912/// \param __c
3913/// A 64-bit integral value used to initialize bits [127:64] of the result.
3914/// \param __d
3915/// A 64-bit integral value used to initialize bits [63:0] of the result.
3916/// \returns An initialized 256-bit integer vector.
3917static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3918_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3919{
3920 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3921}
3922
3923/* Create vectors with elements in reverse order */
3924/// Constructs a 256-bit floating-point vector of [4 x double],
3925/// initialized in reverse order with the specified double-precision
3926/// floating-point values.
3927///
3928/// \headerfile <x86intrin.h>
3929///
3930/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3931/// instruction.
3932///
3933/// \param __a
3934/// A double-precision floating-point value used to initialize bits [63:0]
3935/// of the result.
3936/// \param __b
3937/// A double-precision floating-point value used to initialize bits [127:64]
3938/// of the result.
3939/// \param __c
3940/// A double-precision floating-point value used to initialize bits [191:128]
3941/// of the result.
3942/// \param __d
3943/// A double-precision floating-point value used to initialize bits [255:192]
3944/// of the result.
3945/// \returns An initialized 256-bit floating-point vector of [4 x double].
3946static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3947_mm256_setr_pd(double __a, double __b, double __c, double __d)
3948{
3949 return _mm256_set_pd(__d, __c, __b, __a);
3950}
3951
3952/// Constructs a 256-bit floating-point vector of [8 x float],
3953/// initialized in reverse order with the specified single-precision
///    floating-point values.
3955///
3956/// \headerfile <x86intrin.h>
3957///
3958/// This intrinsic is a utility function and does not correspond to a specific
3959/// instruction.
3960///
3961/// \param __a
3962/// A single-precision floating-point value used to initialize bits [31:0]
3963/// of the result.
3964/// \param __b
3965/// A single-precision floating-point value used to initialize bits [63:32]
3966/// of the result.
3967/// \param __c
3968/// A single-precision floating-point value used to initialize bits [95:64]
3969/// of the result.
3970/// \param __d
3971/// A single-precision floating-point value used to initialize bits [127:96]
3972/// of the result.
3973/// \param __e
3974/// A single-precision floating-point value used to initialize bits [159:128]
3975/// of the result.
3976/// \param __f
3977/// A single-precision floating-point value used to initialize bits [191:160]
3978/// of the result.
3979/// \param __g
3980/// A single-precision floating-point value used to initialize bits [223:192]
3981/// of the result.
3982/// \param __h
3983/// A single-precision floating-point value used to initialize bits [255:224]
3984/// of the result.
3985/// \returns An initialized 256-bit floating-point vector of [8 x float].
3986static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3987_mm256_setr_ps(float __a, float __b, float __c, float __d,
3988 float __e, float __f, float __g, float __h)
3989{
3990 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3991}
3992
3993/// Constructs a 256-bit integer vector, initialized in reverse order
3994/// with the specified 32-bit integral values.
3995///
3996/// \headerfile <x86intrin.h>
3997///
3998/// This intrinsic is a utility function and does not correspond to a specific
3999/// instruction.
4000///
4001/// \param __i0
4002/// A 32-bit integral value used to initialize bits [31:0] of the result.
4003/// \param __i1
4004/// A 32-bit integral value used to initialize bits [63:32] of the result.
4005/// \param __i2
4006/// A 32-bit integral value used to initialize bits [95:64] of the result.
4007/// \param __i3
4008/// A 32-bit integral value used to initialize bits [127:96] of the result.
4009/// \param __i4
4010/// A 32-bit integral value used to initialize bits [159:128] of the result.
4011/// \param __i5
4012/// A 32-bit integral value used to initialize bits [191:160] of the result.
4013/// \param __i6
4014/// A 32-bit integral value used to initialize bits [223:192] of the result.
4015/// \param __i7
4016/// A 32-bit integral value used to initialize bits [255:224] of the result.
4017/// \returns An initialized 256-bit integer vector.
4018static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4019_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4020 int __i4, int __i5, int __i6, int __i7)
4021{
4022 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4023}
4024
4025/// Constructs a 256-bit integer vector, initialized in reverse order
4026/// with the specified 16-bit integral values.
4027///
4028/// \headerfile <x86intrin.h>
4029///
4030/// This intrinsic is a utility function and does not correspond to a specific
4031/// instruction.
4032///
4033/// \param __w15
4034/// A 16-bit integral value used to initialize bits [15:0] of the result.
4035/// \param __w14
4036/// A 16-bit integral value used to initialize bits [31:16] of the result.
4037/// \param __w13
4038/// A 16-bit integral value used to initialize bits [47:32] of the result.
4039/// \param __w12
4040/// A 16-bit integral value used to initialize bits [63:48] of the result.
4041/// \param __w11
4042/// A 16-bit integral value used to initialize bits [79:64] of the result.
4043/// \param __w10
4044/// A 16-bit integral value used to initialize bits [95:80] of the result.
4045/// \param __w09
4046/// A 16-bit integral value used to initialize bits [111:96] of the result.
4047/// \param __w08
4048/// A 16-bit integral value used to initialize bits [127:112] of the result.
4049/// \param __w07
4050/// A 16-bit integral value used to initialize bits [143:128] of the result.
4051/// \param __w06
4052/// A 16-bit integral value used to initialize bits [159:144] of the result.
4053/// \param __w05
4054/// A 16-bit integral value used to initialize bits [175:160] of the result.
4055/// \param __w04
4056/// A 16-bit integral value used to initialize bits [191:176] of the result.
4057/// \param __w03
4058/// A 16-bit integral value used to initialize bits [207:192] of the result.
4059/// \param __w02
4060/// A 16-bit integral value used to initialize bits [223:208] of the result.
4061/// \param __w01
4062/// A 16-bit integral value used to initialize bits [239:224] of the result.
4063/// \param __w00
4064/// A 16-bit integral value used to initialize bits [255:240] of the result.
4065/// \returns An initialized 256-bit integer vector.
4066static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4067_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4068 short __w11, short __w10, short __w09, short __w08,
4069 short __w07, short __w06, short __w05, short __w04,
4070 short __w03, short __w02, short __w01, short __w00)
4071{
4072 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4073 __w04, __w05, __w06, __w07,
4074 __w08, __w09, __w10, __w11,
4075 __w12, __w13, __w14, __w15);
4076}
4077
4078/// Constructs a 256-bit integer vector, initialized in reverse order
4079/// with the specified 8-bit integral values.
4080///
4081/// \headerfile <x86intrin.h>
4082///
4083/// This intrinsic is a utility function and does not correspond to a specific
4084/// instruction.
4085///
4086/// \param __b31
4087/// An 8-bit integral value used to initialize bits [7:0] of the result.
4088/// \param __b30
4089/// An 8-bit integral value used to initialize bits [15:8] of the result.
4090/// \param __b29
4091/// An 8-bit integral value used to initialize bits [23:16] of the result.
4092/// \param __b28
4093/// An 8-bit integral value used to initialize bits [31:24] of the result.
4094/// \param __b27
4095/// An 8-bit integral value used to initialize bits [39:32] of the result.
4096/// \param __b26
4097/// An 8-bit integral value used to initialize bits [47:40] of the result.
4098/// \param __b25
4099/// An 8-bit integral value used to initialize bits [55:48] of the result.
4100/// \param __b24
4101/// An 8-bit integral value used to initialize bits [63:56] of the result.
4102/// \param __b23
4103/// An 8-bit integral value used to initialize bits [71:64] of the result.
4104/// \param __b22
4105/// An 8-bit integral value used to initialize bits [79:72] of the result.
4106/// \param __b21
4107/// An 8-bit integral value used to initialize bits [87:80] of the result.
4108/// \param __b20
4109/// An 8-bit integral value used to initialize bits [95:88] of the result.
4110/// \param __b19
4111/// An 8-bit integral value used to initialize bits [103:96] of the result.
4112/// \param __b18
4113/// An 8-bit integral value used to initialize bits [111:104] of the result.
4114/// \param __b17
4115/// An 8-bit integral value used to initialize bits [119:112] of the result.
4116/// \param __b16
4117/// An 8-bit integral value used to initialize bits [127:120] of the result.
4118/// \param __b15
4119/// An 8-bit integral value used to initialize bits [135:128] of the result.
4120/// \param __b14
4121/// An 8-bit integral value used to initialize bits [143:136] of the result.
4122/// \param __b13
4123/// An 8-bit integral value used to initialize bits [151:144] of the result.
4124/// \param __b12
4125/// An 8-bit integral value used to initialize bits [159:152] of the result.
4126/// \param __b11
4127/// An 8-bit integral value used to initialize bits [167:160] of the result.
4128/// \param __b10
4129/// An 8-bit integral value used to initialize bits [175:168] of the result.
4130/// \param __b09
4131/// An 8-bit integral value used to initialize bits [183:176] of the result.
4132/// \param __b08
4133/// An 8-bit integral value used to initialize bits [191:184] of the result.
4134/// \param __b07
4135/// An 8-bit integral value used to initialize bits [199:192] of the result.
4136/// \param __b06
4137/// An 8-bit integral value used to initialize bits [207:200] of the result.
4138/// \param __b05
4139/// An 8-bit integral value used to initialize bits [215:208] of the result.
4140/// \param __b04
4141/// An 8-bit integral value used to initialize bits [223:216] of the result.
4142/// \param __b03
4143/// An 8-bit integral value used to initialize bits [231:224] of the result.
4144/// \param __b02
4145/// An 8-bit integral value used to initialize bits [239:232] of the result.
4146/// \param __b01
4147/// An 8-bit integral value used to initialize bits [247:240] of the result.
4148/// \param __b00
4149/// An 8-bit integral value used to initialize bits [255:248] of the result.
4150/// \returns An initialized 256-bit integer vector.
4151static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4152_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4153 char __b27, char __b26, char __b25, char __b24,
4154 char __b23, char __b22, char __b21, char __b20,
4155 char __b19, char __b18, char __b17, char __b16,
4156 char __b15, char __b14, char __b13, char __b12,
4157 char __b11, char __b10, char __b09, char __b08,
4158 char __b07, char __b06, char __b05, char __b04,
4159 char __b03, char __b02, char __b01, char __b00)
4160{
4161 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4162 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4163 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4164 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4165}
4166
4167/// Constructs a 256-bit integer vector, initialized in reverse order
4168/// with the specified 64-bit integral values.
4169///
4170/// \headerfile <x86intrin.h>
4171///
4172/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4173/// instruction.
4174///
4175/// \param __a
4176/// A 64-bit integral value used to initialize bits [63:0] of the result.
4177/// \param __b
4178/// A 64-bit integral value used to initialize bits [127:64] of the result.
4179/// \param __c
4180/// A 64-bit integral value used to initialize bits [191:128] of the result.
4181/// \param __d
4182/// A 64-bit integral value used to initialize bits [255:192] of the result.
4183/// \returns An initialized 256-bit integer vector.
4184static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4185_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4186{
4187 return _mm256_set_epi64x(__d, __c, __b, __a);
4188}
4189
4190/* Create vectors with repeated elements */
4191/// Constructs a 256-bit floating-point vector of [4 x double], with each
4192/// of the four double-precision floating-point vector elements set to the
4193/// specified double-precision floating-point value.
4194///
4195/// \headerfile <x86intrin.h>
4196///
4197/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4198///
4199/// \param __w
4200/// A double-precision floating-point value used to initialize each vector
4201/// element of the result.
4202/// \returns An initialized 256-bit floating-point vector of [4 x double].
4203static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4205{
4206 return _mm256_set_pd(__w, __w, __w, __w);
4207}
4208
4209/// Constructs a 256-bit floating-point vector of [8 x float], with each
4210/// of the eight single-precision floating-point vector elements set to the
4211/// specified single-precision floating-point value.
4212///
4213/// \headerfile <x86intrin.h>
4214///
4215/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4216/// instruction.
4217///
4218/// \param __w
4219/// A single-precision floating-point value used to initialize each vector
4220/// element of the result.
4221/// \returns An initialized 256-bit floating-point vector of [8 x float].
4222static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4224{
4225 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4226}
4227
4228/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4229/// 32-bit integral vector elements set to the specified 32-bit integral
4230/// value.
4231///
4232/// \headerfile <x86intrin.h>
4233///
4234/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4235/// instruction.
4236///
4237/// \param __i
4238/// A 32-bit integral value used to initialize each vector element of the
4239/// result.
4240/// \returns An initialized 256-bit integer vector of [8 x i32].
4241static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4243{
4244 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4245}
4246
4247/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4248/// 16-bit integral vector elements set to the specified 16-bit integral
4249/// value.
4250///
4251/// \headerfile <x86intrin.h>
4252///
4253/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4254///
4255/// \param __w
4256/// A 16-bit integral value used to initialize each vector element of the
4257/// result.
4258/// \returns An initialized 256-bit integer vector of [16 x i16].
4259static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4261{
4262 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4263 __w, __w, __w, __w, __w, __w, __w, __w);
4264}
4265
4266/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4267/// 8-bit integral vector elements set to the specified 8-bit integral value.
4268///
4269/// \headerfile <x86intrin.h>
4270///
4271/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4272///
4273/// \param __b
4274/// An 8-bit integral value used to initialize each vector element of the
4275/// result.
4276/// \returns An initialized 256-bit integer vector of [32 x i8].
4277static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4279{
4280 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4281 __b, __b, __b, __b, __b, __b, __b, __b,
4282 __b, __b, __b, __b, __b, __b, __b, __b,
4283 __b, __b, __b, __b, __b, __b, __b, __b);
4284}
4285
4286/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4287/// 64-bit integral vector elements set to the specified 64-bit integral
4288/// value.
4289///
4290/// \headerfile <x86intrin.h>
4291///
4292/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4293///
4294/// \param __q
4295/// A 64-bit integral value used to initialize each vector element of the
4296/// result.
4297/// \returns An initialized 256-bit integer vector of [4 x i64].
4298static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4300{
4301 return _mm256_set_epi64x(__q, __q, __q, __q);
4302}
4303
/* Create zeroed vectors */
4305/// Constructs a 256-bit floating-point vector of [4 x double] with all
4306/// vector elements initialized to zero.
4307///
4308/// \headerfile <x86intrin.h>
4309///
4310/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4311///
4312/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4314 return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
4315}
4316
4317/// Constructs a 256-bit floating-point vector of [8 x float] with all
4318/// vector elements initialized to zero.
4319///
4320/// \headerfile <x86intrin.h>
4321///
4322/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4323///
4324/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4326 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4327}
4328
4329/// Constructs a 256-bit integer vector initialized to zero.
4330///
4331/// \headerfile <x86intrin.h>
4332///
4333/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4334///
4335/// \returns A 256-bit integer vector initialized to zero.
4336static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4338 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4339}
4340
4341/* Cast between vector types */
4342/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4343/// floating-point vector of [8 x float].
4344///
4345/// \headerfile <x86intrin.h>
4346///
4347/// This intrinsic has no corresponding instruction.
4348///
4349/// \param __a
4350/// A 256-bit floating-point vector of [4 x double].
4351/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4352/// bitwise pattern as the parameter.
4353static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4355{
4356 return (__m256)__a;
4357}
4358
4359/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4360/// integer vector.
4361///
4362/// \headerfile <x86intrin.h>
4363///
4364/// This intrinsic has no corresponding instruction.
4365///
4366/// \param __a
4367/// A 256-bit floating-point vector of [4 x double].
4368/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4369/// parameter.
4370static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4372{
4373 return (__m256i)__a;
4374}
4375
4376/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4377/// floating-point vector of [4 x double].
4378///
4379/// \headerfile <x86intrin.h>
4380///
4381/// This intrinsic has no corresponding instruction.
4382///
4383/// \param __a
4384/// A 256-bit floating-point vector of [8 x float].
4385/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4386/// bitwise pattern as the parameter.
4387static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4389{
4390 return (__m256d)__a;
4391}
4392
4393/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4394/// integer vector.
4395///
4396/// \headerfile <x86intrin.h>
4397///
4398/// This intrinsic has no corresponding instruction.
4399///
4400/// \param __a
4401/// A 256-bit floating-point vector of [8 x float].
4402/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4403/// parameter.
4404static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4406{
4407 return (__m256i)__a;
4408}
4409
4410/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4411/// of [8 x float].
4412///
4413/// \headerfile <x86intrin.h>
4414///
4415/// This intrinsic has no corresponding instruction.
4416///
4417/// \param __a
4418/// A 256-bit integer vector.
4419/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4420/// bitwise pattern as the parameter.
4421static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4423{
4424 return (__m256)__a;
4425}
4426
4427/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4428/// of [4 x double].
4429///
4430/// \headerfile <x86intrin.h>
4431///
4432/// This intrinsic has no corresponding instruction.
4433///
4434/// \param __a
4435/// A 256-bit integer vector.
4436/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4437/// bitwise pattern as the parameter.
4438static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4440{
4441 return (__m256d)__a;
4442}
4443
4444/// Returns the lower 128 bits of a 256-bit floating-point vector of
4445/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4446///
4447/// \headerfile <x86intrin.h>
4448///
4449/// This intrinsic has no corresponding instruction.
4450///
4451/// \param __a
4452/// A 256-bit floating-point vector of [4 x double].
4453/// \returns A 128-bit floating-point vector of [2 x double] containing the
4454/// lower 128 bits of the parameter.
4455static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4457{
4458 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4459}
4460
4461/// Returns the lower 128 bits of a 256-bit floating-point vector of
4462/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4463///
4464/// \headerfile <x86intrin.h>
4465///
4466/// This intrinsic has no corresponding instruction.
4467///
4468/// \param __a
4469/// A 256-bit floating-point vector of [8 x float].
4470/// \returns A 128-bit floating-point vector of [4 x float] containing the
4471/// lower 128 bits of the parameter.
4472static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4474{
4475 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4476}
4477
4478/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4479///
4480/// \headerfile <x86intrin.h>
4481///
4482/// This intrinsic has no corresponding instruction.
4483///
4484/// \param __a
4485/// A 256-bit integer vector.
4486/// \returns A 128-bit integer vector containing the lower 128 bits of the
4487/// parameter.
4488static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4490{
4491 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4492}
4493
4494/// Constructs a 256-bit floating-point vector of [4 x double] from a
4495/// 128-bit floating-point vector of [2 x double].
4496///
4497/// The lower 128 bits contain the value of the source vector. The contents
4498/// of the upper 128 bits are undefined.
4499///
4500/// \headerfile <x86intrin.h>
4501///
4502/// This intrinsic has no corresponding instruction.
4503///
4504/// \param __a
4505/// A 128-bit vector of [2 x double].
4506/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4507/// contain the value of the parameter. The contents of the upper 128 bits
4508/// are undefined.
4509static __inline __m256d __DEFAULT_FN_ATTRS
4511{
4512 return __builtin_shufflevector(
4513 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4514}
4515
4516/// Constructs a 256-bit floating-point vector of [8 x float] from a
4517/// 128-bit floating-point vector of [4 x float].
4518///
4519/// The lower 128 bits contain the value of the source vector. The contents
4520/// of the upper 128 bits are undefined.
4521///
4522/// \headerfile <x86intrin.h>
4523///
4524/// This intrinsic has no corresponding instruction.
4525///
4526/// \param __a
4527/// A 128-bit vector of [4 x float].
4528/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4529/// contain the value of the parameter. The contents of the upper 128 bits
4530/// are undefined.
4531static __inline __m256 __DEFAULT_FN_ATTRS
4533{
4534 return __builtin_shufflevector((__v4sf)__a,
4535 (__v4sf)__builtin_nondeterministic_value(__a),
4536 0, 1, 2, 3, 4, 5, 6, 7);
4537}
4538
4539/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4540///
4541/// The lower 128 bits contain the value of the source vector. The contents
4542/// of the upper 128 bits are undefined.
4543///
4544/// \headerfile <x86intrin.h>
4545///
4546/// This intrinsic has no corresponding instruction.
4547///
4548/// \param __a
4549/// A 128-bit integer vector.
4550/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4551/// the parameter. The contents of the upper 128 bits are undefined.
4552static __inline __m256i __DEFAULT_FN_ATTRS
4554{
4555 return __builtin_shufflevector(
4556 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4557}
4558
4559/// Constructs a 256-bit floating-point vector of [4 x double] from a
4560/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4561/// contain the value of the source vector. The upper 128 bits are set
4562/// to zero.
4563///
4564/// \headerfile <x86intrin.h>
4565///
4566/// This intrinsic has no corresponding instruction.
4567///
4568/// \param __a
4569/// A 128-bit vector of [2 x double].
4570/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4571/// contain the value of the parameter. The upper 128 bits are set to zero.
4572static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4574 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4575}
4576
4577/// Constructs a 256-bit floating-point vector of [8 x float] from a
4578/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4579/// the value of the source vector. The upper 128 bits are set to zero.
4580///
4581/// \headerfile <x86intrin.h>
4582///
4583/// This intrinsic has no corresponding instruction.
4584///
4585/// \param __a
4586/// A 128-bit vector of [4 x float].
4587/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4588/// contain the value of the parameter. The upper 128 bits are set to zero.
4589static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4591 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4592}
4593
4594/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4595/// The lower 128 bits contain the value of the source vector. The upper
4596/// 128 bits are set to zero.
4597///
4598/// \headerfile <x86intrin.h>
4599///
4600/// This intrinsic has no corresponding instruction.
4601///
4602/// \param __a
4603/// A 128-bit integer vector.
4604/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4605/// the parameter. The upper 128 bits are set to zero.
4606static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4608 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4609}
4610
4611/*
4612 Vector insert.
4613 We use macros rather than inlines because we only want to accept
4614 invocations where the immediate M is a constant expression.
4615*/
4616/// Constructs a new 256-bit vector of [8 x float] by first duplicating
4617/// a 256-bit vector of [8 x float] given in the first parameter, and then
4618/// replacing either the upper or the lower 128 bits with the contents of a
4619/// 128-bit vector of [4 x float] in the second parameter.
4620///
4621/// The immediate integer parameter determines between the upper or the lower
4622/// 128 bits.
4623///
4624/// \headerfile <x86intrin.h>
4625///
4626/// \code
4627/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4628/// \endcode
4629///
4630/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4631///
4632/// \param V1
4633/// A 256-bit vector of [8 x float]. This vector is copied to the result
4634/// first, and then either the upper or the lower 128 bits of the result will
4635/// be replaced by the contents of \a V2.
4636/// \param V2
4637/// A 128-bit vector of [4 x float]. The contents of this parameter are
4638/// written to either the upper or the lower 128 bits of the result depending
4639/// on the value of parameter \a M.
4640/// \param M
4641/// An immediate integer. The least significant bit determines how the values
4642/// from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
4644/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4645/// result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
4647/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4648/// result.
4649/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4653
4654/// Constructs a new 256-bit vector of [4 x double] by first duplicating
4655/// a 256-bit vector of [4 x double] given in the first parameter, and then
4656/// replacing either the upper or the lower 128 bits with the contents of a
4657/// 128-bit vector of [2 x double] in the second parameter.
4658///
4659/// The immediate integer parameter selects whether the upper or the lower
4660/// 128 bits are replaced.
4661///
4662/// \headerfile <x86intrin.h>
4663///
4664/// \code
4665/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
4666/// \endcode
4667///
4668/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4669///
4670/// \param V1
4671/// A 256-bit vector of [4 x double]. This vector is copied to the result
4672/// first, and then either the upper or the lower 128 bits of the result will
4673/// be replaced by the contents of \a V2.
4674/// \param V2
4675/// A 128-bit vector of [2 x double]. The contents of this parameter are
4676/// written to either the upper or the lower 128 bits of the result depending
4677/// on the value of parameter \a M.
4678/// \param M
4679/// An immediate integer. The least significant bit determines how the values
4680/// from the two parameters are combined: \n
4681/// If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
4682/// [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
4683/// [255:128] of the result. \n
4684/// If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
4685/// [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
4686/// [127:0] of the result.
4687/// \returns A 256-bit vector of [4 x double] containing the combined values.
4688#define _mm256_insertf128_pd(V1, V2, M) \
4689 ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
4690 (__v2df)(__m128d)(V2), (int)(M)))
4691
4692/// Constructs a new 256-bit integer vector by first duplicating a
4693/// 256-bit integer vector given in the first parameter, and then replacing
4694/// either the upper or the lower 128 bits with the contents of a 128-bit
4695/// integer vector in the second parameter.
4696///
4697/// The immediate integer parameter selects whether the upper or the lower
4698/// 128 bits are replaced.
4699///
4700/// \headerfile <x86intrin.h>
4701///
4702/// \code
4703/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
4704/// \endcode
4705///
4706/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4707///
4708/// \param V1
4709/// A 256-bit integer vector. This vector is copied to the result first, and
4710/// then either the upper or the lower 128 bits of the result will be
4711/// replaced by the contents of \a V2.
4712/// \param V2
4713/// A 128-bit integer vector. The contents of this parameter are written to
4714/// either the upper or the lower 128 bits of the result depending on the
4715/// value of parameter \a M.
4716/// \param M
4717/// An immediate integer. The least significant bit determines how the values
4718/// from the two parameters are combined: \n
4719/// If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
4720/// [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
4721/// [255:128] of the result. \n
4722/// If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
4723/// [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
4724/// [127:0] of the result.
4725/// \returns A 256-bit integer vector containing the combined values.
4726#define _mm256_insertf128_si256(V1, V2, M) \
4727 ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
4728 (__v4si)(__m128i)(V2), (int)(M)))
4729
4730/*
4731 Vector extract.
4732 We use macros rather than inlines because we only want to accept
4733 invocations where the immediate M is a constant expression.
4734*/
4735/// Extracts either the upper or the lower 128 bits from a 256-bit vector
4736/// of [8 x float], as determined by the immediate integer parameter, and
4737/// returns the extracted bits as a 128-bit vector of [4 x float].
4738///
4739/// \headerfile <x86intrin.h>
4740///
4741/// \code
4742/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
4743/// \endcode
4744///
4745/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4746///
4747/// \param V
4748/// A 256-bit vector of [8 x float].
4749/// \param M
4750/// An immediate integer. The least significant bit determines which bits are
4751/// extracted from \a V: \n
4752/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4753/// result. \n
4754/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4755/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
4756#define _mm256_extractf128_ps(V, M) \
4757 ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
4758
4759/// Extracts either the upper or the lower 128 bits from a 256-bit vector
4760/// of [4 x double], as determined by the immediate integer parameter, and
4761/// returns the extracted bits as a 128-bit vector of [2 x double].
4762///
4763/// \headerfile <x86intrin.h>
4764///
4765/// \code
4766/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
4767/// \endcode
4768///
4769/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4770///
4771/// \param V
4772/// A 256-bit vector of [4 x double].
4773/// \param M
4774/// An immediate integer. The least significant bit determines which bits are
4775/// extracted from \a V: \n
4776/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4777/// result. \n
4778/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4779/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
4780#define _mm256_extractf128_pd(V, M) \
4781 ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
4782
4783/// Extracts either the upper or the lower 128 bits from a 256-bit
4784/// integer vector, as determined by the immediate integer parameter, and
4785/// returns the extracted bits as a 128-bit integer vector.
4786///
4787/// \headerfile <x86intrin.h>
4788///
4789/// \code
4790/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
4791/// \endcode
4792///
4793/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4794///
4795/// \param V
4796/// A 256-bit integer vector.
4797/// \param M
4798/// An immediate integer. The least significant bit determines which bits are
4799/// extracted from \a V: \n
4800/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4801/// result. \n
4802/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4803/// \returns A 128-bit integer vector containing the extracted bits.
4804#define _mm256_extractf128_si256(V, M) \
4805 ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
4806
4807/// Constructs a 256-bit floating-point vector of [8 x float] by
4808/// concatenating two 128-bit floating-point vectors of [4 x float].
4809///
4810/// \headerfile <x86intrin.h>
4811///
4812/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4813///
4814/// \param __hi
4815/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4816/// 128 bits of the result.
4817/// \param __lo
4818/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4819/// 128 bits of the result.
4820/// \returns A 256-bit floating-point vector of [8 x float] containing the
4821/// concatenated result: \a __lo in bits [127:0], \a __hi in bits [255:128].
4822static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4823_mm256_set_m128(__m128 __hi, __m128 __lo) {
4824 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4825}
4826
4827/// Constructs a 256-bit floating-point vector of [4 x double] by
4828/// concatenating two 128-bit floating-point vectors of [2 x double].
4829///
4830/// \headerfile <x86intrin.h>
4831///
4832/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4833///
4834/// \param __hi
4835/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4836/// 128 bits of the result.
4837/// \param __lo
4838/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4839/// 128 bits of the result.
4840/// \returns A 256-bit floating-point vector of [4 x double] containing the
4841/// concatenated result: \a __lo in bits [127:0], \a __hi in bits [255:128].
4842static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4843_mm256_set_m128d(__m128d __hi, __m128d __lo) {
4844 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4845}
4846
4847/// Constructs a 256-bit integer vector by concatenating two 128-bit
4848/// integer vectors, with \a __hi placed above \a __lo.
4849///
4850/// \headerfile <x86intrin.h>
4851///
4852/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4853///
4854/// \param __hi
4855/// A 128-bit integer vector to be copied to the upper 128 bits of the
4856/// result.
4857/// \param __lo
4858/// A 128-bit integer vector to be copied to the lower 128 bits of the
4859/// result.
4860/// \returns A 256-bit integer vector containing the concatenated result.
4861static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4862_mm256_set_m128i(__m128i __hi, __m128i __lo) {
4863 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4864}
4865
4866/// Constructs a 256-bit floating-point vector of [8 x float] by
4867/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4868/// similar to _mm256_set_m128, but the order of the input parameters is
4869/// swapped.
4870///
4871/// \headerfile <x86intrin.h>
4872///
4873/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4874///
4875/// \param __lo
4876/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4877/// 128 bits of the result.
4878/// \param __hi
4879/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4880/// 128 bits of the result.
4881/// \returns A 256-bit floating-point vector of [8 x float] containing the
4882/// concatenated result: \a __lo in bits [127:0], \a __hi in bits [255:128].
4883static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4884_mm256_setr_m128(__m128 __lo, __m128 __hi) {
4885 return _mm256_set_m128(__hi, __lo);
4886}
4887
4888/// Constructs a 256-bit floating-point vector of [4 x double] by
4889/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4890/// similar to _mm256_set_m128d, but the order of the input parameters is
4891/// swapped.
4892///
4893/// \headerfile <x86intrin.h>
4894///
4895/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4896///
4897/// \param __lo
4898/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4899/// 128 bits of the result.
4900/// \param __hi
4901/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4902/// 128 bits of the result.
4903/// \returns A 256-bit floating-point vector of [4 x double] containing the
4904/// concatenated result: \a __lo in bits [127:0], \a __hi in bits [255:128].
4905static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4906_mm256_setr_m128d(__m128d __lo, __m128d __hi) {
4907 return (__m256d)_mm256_set_m128d(__hi, __lo);
4908}
4909
4910/// Constructs a 256-bit integer vector by concatenating two 128-bit
4911/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4912/// the input parameters is swapped (\a __lo first, then \a __hi).
4913///
4914/// \headerfile <x86intrin.h>
4915///
4916/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4917///
4918/// \param __lo
4919/// A 128-bit integer vector to be copied to the lower 128 bits of the
4920/// result.
4921/// \param __hi
4922/// A 128-bit integer vector to be copied to the upper 128 bits of the
4923/// result.
4924/// \returns A 256-bit integer vector containing the concatenated result.
4925static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4926_mm256_setr_m128i(__m128i __lo, __m128i __hi) {
4927 return (__m256i)_mm256_set_m128i(__hi, __lo);
4928}
4929
4930/* SIMD load ops (unaligned) */
4931/// Loads two 128-bit floating-point vectors of [4 x float] from
4932/// unaligned memory locations and constructs a 256-bit floating-point vector
4933/// of [8 x float] by concatenating the two 128-bit vectors.
4934///
4935/// \headerfile <x86intrin.h>
4936///
4937/// This intrinsic corresponds to load instructions followed by the
4938/// <c> VINSERTF128 </c> instruction.
4939///
4940/// \param __addr_hi
4941/// A pointer to a 128-bit memory location containing four consecutive
4942/// single-precision floating-point values. These values are to be copied to
4943/// bits [255:128] of the result. The address of the memory location does not
4944/// have to be aligned.
4945/// \param __addr_lo
4946/// A pointer to a 128-bit memory location containing four consecutive
4947/// single-precision floating-point values. These values are to be copied to
4948/// bits [127:0] of the result. The address of the memory location does not
4949/// have to be aligned.
4950/// \returns A 256-bit floating-point vector of [8 x float] containing the
4951/// concatenated result.
4952static __inline __m256 __DEFAULT_FN_ATTRS
4953_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4954{
4955 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4956}
4957
4958/// Loads two 128-bit floating-point vectors of [2 x double] from
4959/// unaligned memory locations and constructs a 256-bit floating-point vector
4960/// of [4 x double] by concatenating the two 128-bit vectors.
4961///
4962/// \headerfile <x86intrin.h>
4963///
4964/// This intrinsic corresponds to load instructions followed by the
4965/// <c> VINSERTF128 </c> instruction.
4966///
4967/// \param __addr_hi
4968/// A pointer to a 128-bit memory location containing two consecutive
4969/// double-precision floating-point values. These values are to be copied to
4970/// bits [255:128] of the result. The address of the memory location does not
4971/// have to be aligned.
4972/// \param __addr_lo
4973/// A pointer to a 128-bit memory location containing two consecutive
4974/// double-precision floating-point values. These values are to be copied to
4975/// bits [127:0] of the result. The address of the memory location does not
4976/// have to be aligned.
4977/// \returns A 256-bit floating-point vector of [4 x double] containing the
4978/// concatenated result.
4979static __inline __m256d __DEFAULT_FN_ATTRS
4980_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4981{
4982 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4983}
4984
4985/// Loads two 128-bit integer vectors from unaligned memory locations and
4986/// constructs a 256-bit integer vector by concatenating the two 128-bit
4987/// vectors.
4988///
4989/// \headerfile <x86intrin.h>
4990///
4991/// This intrinsic corresponds to load instructions followed by the
4992/// <c> VINSERTF128 </c> instruction.
4993///
4994/// \param __addr_hi
4995/// A pointer to a 128-bit memory location containing a 128-bit integer
4996/// vector. This vector is to be copied to bits [255:128] of the result. The
4997/// address of the memory location does not have to be aligned.
4998/// \param __addr_lo
4999/// A pointer to a 128-bit memory location containing a 128-bit integer
5000/// vector. This vector is to be copied to bits [127:0] of the result. The
5001/// address of the memory location does not have to be aligned.
5002/// \returns A 256-bit integer vector containing the concatenated result.
5003static __inline __m256i __DEFAULT_FN_ATTRS
5004_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
5005{
5006 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5007}
5008
5009/* SIMD store ops (unaligned) */
5010/// Stores the lower and then the upper 128 bits of a 256-bit floating-point
5011/// vector of [8 x float] into two different unaligned memory locations.
5012///
5013/// \headerfile <x86intrin.h>
5014///
5015/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5016/// store instructions.
5017///
5018/// \param __addr_hi
5019/// A pointer to a 128-bit memory location. Bits [255:128] of \a __a are to be
5020/// copied to this memory location. The address of this memory location does
5021/// not have to be aligned.
5022/// \param __addr_lo
5023/// A pointer to a 128-bit memory location. Bits [127:0] of \a __a are to be
5024/// copied to this memory location. The address of this memory location does
5025/// not have to be aligned.
5026/// \param __a
5027/// A 256-bit floating-point vector of [8 x float].
5028static __inline void __DEFAULT_FN_ATTRS
5029_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5030{
5031 __m128 __v128;
5032
5033 __v128 = _mm256_castps256_ps128(__a);
5034 _mm_storeu_ps(__addr_lo, __v128);
5035 __v128 = _mm256_extractf128_ps(__a, 1);
5036 _mm_storeu_ps(__addr_hi, __v128);
5037}
5038
5039/// Stores the lower and then the upper 128 bits of a 256-bit floating-point
5040/// vector of [4 x double] into two different unaligned memory locations.
5041///
5042/// \headerfile <x86intrin.h>
5043///
5044/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5045/// store instructions.
5046///
5047/// \param __addr_hi
5048/// A pointer to a 128-bit memory location. Bits [255:128] of \a __a are to be
5049/// copied to this memory location. The address of this memory location does
5050/// not have to be aligned.
5051/// \param __addr_lo
5052/// A pointer to a 128-bit memory location. Bits [127:0] of \a __a are to be
5053/// copied to this memory location. The address of this memory location does
5054/// not have to be aligned.
5055/// \param __a
5056/// A 256-bit floating-point vector of [4 x double].
5057static __inline void __DEFAULT_FN_ATTRS
5058_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5059{
5060 __m128d __v128;
5061
5062 __v128 = _mm256_castpd256_pd128(__a);
5063 _mm_storeu_pd(__addr_lo, __v128);
5064 __v128 = _mm256_extractf128_pd(__a, 1);
5065 _mm_storeu_pd(__addr_hi, __v128);
5066}
5067
5068/// Stores the lower and then the upper 128 bits of a 256-bit integer vector
5069/// into two different unaligned memory locations.
5070///
5071/// \headerfile <x86intrin.h>
5072///
5073/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5074/// store instructions.
5075///
5076/// \param __addr_hi
5077/// A pointer to a 128-bit memory location. Bits [255:128] of \a __a are to be
5078/// copied to this memory location. The address of this memory location does
5079/// not have to be aligned.
5080/// \param __addr_lo
5081/// A pointer to a 128-bit memory location. Bits [127:0] of \a __a are to be
5082/// copied to this memory location. The address of this memory location does
5083/// not have to be aligned.
5084/// \param __a
5085/// A 256-bit integer vector.
5086static __inline void __DEFAULT_FN_ATTRS
5087_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5088{
5089 __m128i __v128;
5090
5091 __v128 = _mm256_castsi256_si128(__a);
5092 _mm_storeu_si128(__addr_lo, __v128);
5093 __v128 = _mm256_extractf128_si256(__a, 1);
5094 _mm_storeu_si128(__addr_hi, __v128);
5095}
5096
/* Undefine the function-attribute helper macros before the header ends. */
5097#undef __DEFAULT_FN_ATTRS
5098#undef __DEFAULT_FN_ATTRS_CONSTEXPR
5099#undef __DEFAULT_FN_ATTRS128
5100#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5101
5102#endif /* __AVXINTRIN_H */
__device__ _Float16
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3046
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition avxintrin.h:3090
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:744
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition avxintrin.h:3296
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2918
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition avxintrin.h:3592
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition avxintrin.h:3110
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4573
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition avxintrin.h:2279
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition avxintrin.h:3260
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition avxintrin.h:3316
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition avxintrin.h:4953
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:356
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3406
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition avxintrin.h:581
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition avxintrin.h:3752
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4590
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition avxintrin.h:116
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:390
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition avxintrin.h:2964
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition avxintrin.h:2204
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition avxintrin.h:3641
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition avxintrin.h:306
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2866
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition avxintrin.h:3987
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4884
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3431
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3382
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition avxintrin.h:982
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition avxintrin.h:4371
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition avxintrin.h:188
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4926
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition avxintrin.h:3354
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition avxintrin.h:4756
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition avxintrin.h:4804
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition avxintrin.h:3203
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2945
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition avxintrin.h:4422
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition avxintrin.h:4354
static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition avxintrin.h:2346
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition avxintrin.h:3681
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition avxintrin.h:2371
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4906
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition avxintrin.h:132
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition avxintrin.h:1406
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition avxintrin.h:3572
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition avxintrin.h:891
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition avxintrin.h:3628
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:373
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition avxintrin.h:3800
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition avxintrin.h:2189
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition avxintrin.h:602
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition avxintrin.h:3504
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4510
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition avxintrin.h:4204
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2514
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition avxintrin.h:2259
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition avxintrin.h:3654
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition avxintrin.h:2239
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition avxintrin.h:2223
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition avxintrin.h:4223
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2488
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2602
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition avxintrin.h:288
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2836
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition avxintrin.h:3146
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition avxintrin.h:2175
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition avxintrin.h:2330
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition avxintrin.h:4780
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers returned in a vector of [8 x i32].
Definition avxintrin.h:2299
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4532
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3068
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition avxintrin.h:246
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zero.
Definition avxintrin.h:4325
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2778
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition avxintrin.h:4242
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition avxintrin.h:3883
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition avxintrin.h:4980
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition avxintrin.h:82
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:674
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2690
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two diffe...
Definition avxintrin.h:5058
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition avxintrin.h:339
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x doub...
Definition avxintrin.h:151
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:721
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x floa...
Definition avxintrin.h:170
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2719
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition avxintrin.h:4185
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements set to the specified 64-bit integral value.
Definition avxintrin.h:4299
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition avxintrin.h:3528
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition avxintrin.h:3480
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition avxintrin.h:4152
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4439
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p in...
Definition avxintrin.h:3163
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-preci...
Definition avxintrin.h:3720
static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-point vector of [2 x double].
Definition avxintrin.h:4456
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2892
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4607
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition avxintrin.h:98
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:698
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:767
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2748
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition avxintrin.h:4067
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition avxintrin.h:2396
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition avxintrin.h:2418
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory l...
Definition avxintrin.h:5087
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2543
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-point vector of [4 x float].
Definition avxintrin.h:4473
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:620
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition avxintrin.h:4405
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:656
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:638
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition avxintrin.h:1433
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition avxintrin.h:5004
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition avxintrin.h:797
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition avxintrin.h:4313
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition avxintrin.h:3918
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3455
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition avxintrin.h:322
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition avxintrin.h:3613
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition avxintrin.h:560
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3219
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to b...
Definition avxintrin.h:3337
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition avxintrin.h:267
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4823
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4388
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4337
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4553
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition avxintrin.h:204
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p in...
Definition avxintrin.h:3183
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3024
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2660
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition avxintrin.h:4019
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition avxintrin.h:2982
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition avxintrin.h:2462
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition avxintrin.h:2441
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition avxintrin.h:4260
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition avxintrin.h:836
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set to the specified 8-bit integral value.
Definition avxintrin.h:4278
static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition avxintrin.h:4489
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition avxintrin.h:225
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2631
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3240
static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition avxintrin.h:2315
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4862
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte align...
Definition avxintrin.h:3278
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the spe...
Definition avxintrin.h:3947
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two differ...
Definition avxintrin.h:5029
typedef double __v4df __attribute__((__vector_size__(32)))
Definition avxintrin.h:17
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2572
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2807
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition avxintrin.h:3552
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4843
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition avxintrin.h:3130
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition avxintrin.h:542
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition emmintrin.h:1619
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878
static __inline__ void _mm_stream_si32(void *__p, int __a)
Definition emmintrin.h:4077
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1867
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition emmintrin.h:3456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition emmintrin.h:1980
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition emmintrin.h:3909
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2100
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2021
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1863