clang 23.0.0git
avxintrin.h
Go to the documentation of this file.
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
17typedef double __v4df __attribute__ ((__vector_size__ (32)));
18typedef float __v8sf __attribute__ ((__vector_size__ (32)));
19typedef long long __v4di __attribute__ ((__vector_size__ (32)));
20typedef int __v8si __attribute__ ((__vector_size__ (32)));
21typedef short __v16hi __attribute__ ((__vector_size__ (32)));
22typedef char __v32qi __attribute__ ((__vector_size__ (32)));
23
24/* Unsigned types */
25typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
26typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
27typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
28typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
29
30/* We need an explicitly signed variant for char. Note that this shouldn't
31 * appear in the interface though. */
32typedef signed char __v32qs __attribute__((__vector_size__(32)));
33
34typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
35typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
36typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
37
38typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
39typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
40typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
41
42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Attribute sets applied to the functions defined in this file. Both request
 * forced inlining, no debug info and the "avx" target; they differ only in
 * the minimum vector width they declare (256 vs. 128 bits). */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(128)))

/* The *_CONSTEXPR variants additionally append constexpr, but only when the
 * header is compiled as C++11 or newer; otherwise they are plain aliases. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#endif
67
68/* Arithmetic */
69/// Adds two 256-bit vectors of [4 x double].
70///
71/// \headerfile <x86intrin.h>
72///
73/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
74///
75/// \param __a
76/// A 256-bit vector of [4 x double] containing one of the source operands.
77/// \param __b
78/// A 256-bit vector of [4 x double] containing one of the source operands.
79/// \returns A 256-bit vector of [4 x double] containing the sums of both
80/// operands.
81static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
82_mm256_add_pd(__m256d __a, __m256d __b) {
83 return (__m256d)((__v4df)__a+(__v4df)__b);
84}
85
86/// Adds two 256-bit vectors of [8 x float].
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
91///
92/// \param __a
93/// A 256-bit vector of [8 x float] containing one of the source operands.
94/// \param __b
95/// A 256-bit vector of [8 x float] containing one of the source operands.
96/// \returns A 256-bit vector of [8 x float] containing the sums of both
97/// operands.
98static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a,
99 __m256 __b) {
100 return (__m256)((__v8sf)__a+(__v8sf)__b);
101}
102
103/// Subtracts two 256-bit vectors of [4 x double].
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
108///
109/// \param __a
110/// A 256-bit vector of [4 x double] containing the minuend.
111/// \param __b
112/// A 256-bit vector of [4 x double] containing the subtrahend.
113/// \returns A 256-bit vector of [4 x double] containing the differences between
114/// both operands.
115static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
116_mm256_sub_pd(__m256d __a, __m256d __b) {
117 return (__m256d)((__v4df)__a-(__v4df)__b);
118}
119
120/// Subtracts two 256-bit vectors of [8 x float].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
125///
126/// \param __a
127/// A 256-bit vector of [8 x float] containing the minuend.
128/// \param __b
129/// A 256-bit vector of [8 x float] containing the subtrahend.
130/// \returns A 256-bit vector of [8 x float] containing the differences between
131/// both operands.
132static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a,
133 __m256 __b) {
134 return (__m256)((__v8sf)__a-(__v8sf)__b);
135}
136
137/// Adds the even-indexed values and subtracts the odd-indexed values of
138/// two 256-bit vectors of [4 x double].
139///
140/// \headerfile <x86intrin.h>
141///
142/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
143///
144/// \param __a
145/// A 256-bit vector of [4 x double] containing the left source operand.
146/// \param __b
147/// A 256-bit vector of [4 x double] containing the right source operand.
148/// \returns A 256-bit vector of [4 x double] containing the alternating sums
149/// and differences between both operands.
150static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
151_mm256_addsub_pd(__m256d __a, __m256d __b) {
152 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
153}
154
155/// Adds the even-indexed values and subtracts the odd-indexed values of
156/// two 256-bit vectors of [8 x float].
157///
158/// \headerfile <x86intrin.h>
159///
160/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
161///
162/// \param __a
163/// A 256-bit vector of [8 x float] containing the left source operand.
164/// \param __b
165/// A 256-bit vector of [8 x float] containing the right source operand.
166/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
167/// differences between both operands.
168static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
169_mm256_addsub_ps(__m256 __a, __m256 __b) {
170 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
171}
172
173/// Divides two 256-bit vectors of [4 x double].
174///
175/// \headerfile <x86intrin.h>
176///
177/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
178///
179/// \param __a
180/// A 256-bit vector of [4 x double] containing the dividend.
181/// \param __b
182/// A 256-bit vector of [4 x double] containing the divisor.
183/// \returns A 256-bit vector of [4 x double] containing the quotients of both
184/// operands.
185static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
186_mm256_div_pd(__m256d __a, __m256d __b) {
187 return (__m256d)((__v4df)__a/(__v4df)__b);
188}
189
190/// Divides two 256-bit vectors of [8 x float].
191///
192/// \headerfile <x86intrin.h>
193///
194/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
195///
196/// \param __a
197/// A 256-bit vector of [8 x float] containing the dividend.
198/// \param __b
199/// A 256-bit vector of [8 x float] containing the divisor.
200/// \returns A 256-bit vector of [8 x float] containing the quotients of both
201/// operands.
202static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a,
203 __m256 __b) {
204 return (__m256)((__v8sf)__a/(__v8sf)__b);
205}
206
207/// Compares two 256-bit vectors of [4 x double] and returns the greater
208/// of each pair of values.
209///
210/// If either value in a comparison is NaN, returns the value from \a __b.
211///
212/// \headerfile <x86intrin.h>
213///
214/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
215///
216/// \param __a
217/// A 256-bit vector of [4 x double] containing one of the operands.
218/// \param __b
219/// A 256-bit vector of [4 x double] containing one of the operands.
220/// \returns A 256-bit vector of [4 x double] containing the maximum values
221/// between both operands.
222static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
223_mm256_max_pd(__m256d __a, __m256d __b) {
224 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
225}
226
227/// Compares two 256-bit vectors of [8 x float] and returns the greater
228/// of each pair of values.
229///
230/// If either value in a comparison is NaN, returns the value from \a __b.
231///
232/// \headerfile <x86intrin.h>
233///
234/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
235///
236/// \param __a
237/// A 256-bit vector of [8 x float] containing one of the operands.
238/// \param __b
239/// A 256-bit vector of [8 x float] containing one of the operands.
240/// \returns A 256-bit vector of [8 x float] containing the maximum values
241/// between both operands.
242static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_max_ps(__m256 __a,
243 __m256 __b) {
244 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
245}
246
247/// Compares two 256-bit vectors of [4 x double] and returns the lesser
248/// of each pair of values.
249///
250/// If either value in a comparison is NaN, returns the value from \a __b.
251///
252/// \headerfile <x86intrin.h>
253///
254/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
255///
256/// \param __a
257/// A 256-bit vector of [4 x double] containing one of the operands.
258/// \param __b
259/// A 256-bit vector of [4 x double] containing one of the operands.
260/// \returns A 256-bit vector of [4 x double] containing the minimum values
261/// between both operands.
262static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
263_mm256_min_pd(__m256d __a, __m256d __b) {
264 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
265}
266
267/// Compares two 256-bit vectors of [8 x float] and returns the lesser
268/// of each pair of values.
269///
270/// If either value in a comparison is NaN, returns the value from \a __b.
271///
272/// \headerfile <x86intrin.h>
273///
274/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
275///
276/// \param __a
277/// A 256-bit vector of [8 x float] containing one of the operands.
278/// \param __b
279/// A 256-bit vector of [8 x float] containing one of the operands.
280/// \returns A 256-bit vector of [8 x float] containing the minimum values
281/// between both operands.
282static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_min_ps(__m256 __a,
283 __m256 __b) {
284 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
285}
286
287/// Multiplies two 256-bit vectors of [4 x double].
288///
289/// \headerfile <x86intrin.h>
290///
291/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
292///
293/// \param __a
294/// A 256-bit vector of [4 x double] containing one of the operands.
295/// \param __b
296/// A 256-bit vector of [4 x double] containing one of the operands.
297/// \returns A 256-bit vector of [4 x double] containing the products of both
298/// operands.
299static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
300_mm256_mul_pd(__m256d __a, __m256d __b) {
301 return (__m256d)((__v4df)__a * (__v4df)__b);
302}
303
304/// Multiplies two 256-bit vectors of [8 x float].
305///
306/// \headerfile <x86intrin.h>
307///
308/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
309///
310/// \param __a
311/// A 256-bit vector of [8 x float] containing one of the operands.
312/// \param __b
313/// A 256-bit vector of [8 x float] containing one of the operands.
314/// \returns A 256-bit vector of [8 x float] containing the products of both
315/// operands.
316static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
317 __m256 __b) {
318 return (__m256)((__v8sf)__a * (__v8sf)__b);
319}
320
321/// Calculates the square roots of the values in a 256-bit vector of
322/// [4 x double].
323///
324/// \headerfile <x86intrin.h>
325///
326/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
327///
328/// \param __a
329/// A 256-bit vector of [4 x double].
330/// \returns A 256-bit vector of [4 x double] containing the square roots of the
331/// values in the operand.
332static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
333 return __builtin_elementwise_sqrt(__a);
334}
335
336/// Calculates the square roots of the values in a 256-bit vector of
337/// [8 x float].
338///
339/// \headerfile <x86intrin.h>
340///
341/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
342///
343/// \param __a
344/// A 256-bit vector of [8 x float].
345/// \returns A 256-bit vector of [8 x float] containing the square roots of the
346/// values in the operand.
347static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
348 return __builtin_elementwise_sqrt(__a);
349}
350
351/// Calculates the reciprocal square roots of the values in a 256-bit
352/// vector of [8 x float].
353///
354/// \headerfile <x86intrin.h>
355///
356/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
357///
358/// \param __a
359/// A 256-bit vector of [8 x float].
360/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
361/// roots of the values in the operand.
362static __inline __m256 __DEFAULT_FN_ATTRS
364{
365 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
366}
367
368/// Calculates the reciprocals of the values in a 256-bit vector of
369/// [8 x float].
370///
371/// \headerfile <x86intrin.h>
372///
373/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
374///
375/// \param __a
376/// A 256-bit vector of [8 x float].
377/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
378/// values in the operand.
379static __inline __m256 __DEFAULT_FN_ATTRS
381{
382 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
383}
384
/// Rounds every element of a 256-bit vector of [4 x double] to an integer
///    value, using the rounding behavior selected by the immediate operand.
///    The results are returned as 64-bit double-precision floating-point
///    values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M)                                                  \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
416
/// Rounds every element of a 256-bit vector of [8 x float] to an integer
///    value, using the rounding behavior selected by the immediate operand.
///    The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M)                                                  \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
448
/// Rounds every element of a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values. Expands to _mm256_round_pd with
///    _MM_FROUND_CEIL.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
465
/// Rounds every element of a 256-bit vector of [4 x double] down to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values. Expands to _mm256_round_pd with
///    _MM_FROUND_FLOOR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
483
/// Rounds every element of a 256-bit vector of [8 x float] up to an integer
///    value. The results are returned as floating-point values. Expands to
///    _mm256_round_ps with _MM_FROUND_CEIL.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
500
/// Rounds every element of a 256-bit vector of [8 x float] down to an
///    integer value. The results are returned as floating-point values.
///    Expands to _mm256_round_ps with _MM_FROUND_FLOOR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
517
518/* Logical */
519/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
520///
521/// \headerfile <x86intrin.h>
522///
523/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
524///
525/// \param __a
526/// A 256-bit vector of [4 x double] containing one of the source operands.
527/// \param __b
528/// A 256-bit vector of [4 x double] containing one of the source operands.
529/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
530/// values between both operands.
531static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
532_mm256_and_pd(__m256d __a, __m256d __b)
533{
534 return (__m256d)((__v4du)__a & (__v4du)__b);
535}
536
537/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
538///
539/// \headerfile <x86intrin.h>
540///
541/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
542///
543/// \param __a
544/// A 256-bit vector of [8 x float] containing one of the source operands.
545/// \param __b
546/// A 256-bit vector of [8 x float] containing one of the source operands.
547/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
548/// values between both operands.
549static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
550_mm256_and_ps(__m256 __a, __m256 __b)
551{
552 return (__m256)((__v8su)__a & (__v8su)__b);
553}
554
555/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
556/// the one's complement of the values contained in the first source operand.
557///
558/// \headerfile <x86intrin.h>
559///
560/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
561///
562/// \param __a
563/// A 256-bit vector of [4 x double] containing the left source operand. The
564/// one's complement of this value is used in the bitwise AND.
565/// \param __b
566/// A 256-bit vector of [4 x double] containing the right source operand.
567/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
568/// values of the second operand and the one's complement of the first
569/// operand.
570static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
571_mm256_andnot_pd(__m256d __a, __m256d __b)
572{
573 return (__m256d)(~(__v4du)__a & (__v4du)__b);
574}
575
576/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
577/// the one's complement of the values contained in the first source operand.
578///
579/// \headerfile <x86intrin.h>
580///
581/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
582///
583/// \param __a
584/// A 256-bit vector of [8 x float] containing the left source operand. The
585/// one's complement of this value is used in the bitwise AND.
586/// \param __b
587/// A 256-bit vector of [8 x float] containing the right source operand.
588/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
589/// values of the second operand and the one's complement of the first
590/// operand.
591static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
592_mm256_andnot_ps(__m256 __a, __m256 __b)
593{
594 return (__m256)(~(__v8su)__a & (__v8su)__b);
595}
596
597/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
598///
599/// \headerfile <x86intrin.h>
600///
601/// This intrinsic corresponds to the <c> VORPD </c> instruction.
602///
603/// \param __a
604/// A 256-bit vector of [4 x double] containing one of the source operands.
605/// \param __b
606/// A 256-bit vector of [4 x double] containing one of the source operands.
607/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
608/// values between both operands.
609static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
610_mm256_or_pd(__m256d __a, __m256d __b)
611{
612 return (__m256d)((__v4du)__a | (__v4du)__b);
613}
614
615/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
616///
617/// \headerfile <x86intrin.h>
618///
619/// This intrinsic corresponds to the <c> VORPS </c> instruction.
620///
621/// \param __a
622/// A 256-bit vector of [8 x float] containing one of the source operands.
623/// \param __b
624/// A 256-bit vector of [8 x float] containing one of the source operands.
625/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
626/// values between both operands.
627static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
628_mm256_or_ps(__m256 __a, __m256 __b)
629{
630 return (__m256)((__v8su)__a | (__v8su)__b);
631}
632
633/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
634///
635/// \headerfile <x86intrin.h>
636///
637/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
638///
639/// \param __a
640/// A 256-bit vector of [4 x double] containing one of the source operands.
641/// \param __b
642/// A 256-bit vector of [4 x double] containing one of the source operands.
643/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
644/// values between both operands.
645static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
646_mm256_xor_pd(__m256d __a, __m256d __b)
647{
648 return (__m256d)((__v4du)__a ^ (__v4du)__b);
649}
650
651/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
652///
653/// \headerfile <x86intrin.h>
654///
655/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
656///
657/// \param __a
658/// A 256-bit vector of [8 x float] containing one of the source operands.
659/// \param __b
660/// A 256-bit vector of [8 x float] containing one of the source operands.
661/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
662/// values between both operands.
663static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
664_mm256_xor_ps(__m256 __a, __m256 __b)
665{
666 return (__m256)((__v8su)__a ^ (__v8su)__b);
667}
668
669/* Horizontal arithmetic */
670/// Horizontally adds the adjacent pairs of values contained in two
671/// 256-bit vectors of [4 x double].
672///
673/// \headerfile <x86intrin.h>
674///
675/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
676///
677/// \param __a
678/// A 256-bit vector of [4 x double] containing one of the source operands.
679/// The horizontal sums of the values are returned in the even-indexed
680/// elements of a vector of [4 x double].
681/// \param __b
682/// A 256-bit vector of [4 x double] containing one of the source operands.
683/// The horizontal sums of the values are returned in the odd-indexed
684/// elements of a vector of [4 x double].
685/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
686/// both operands.
687static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
688_mm256_hadd_pd(__m256d __a, __m256d __b) {
689 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
690}
691
692/// Horizontally adds the adjacent pairs of values contained in two
693/// 256-bit vectors of [8 x float].
694///
695/// \headerfile <x86intrin.h>
696///
697/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
698///
699/// \param __a
700/// A 256-bit vector of [8 x float] containing one of the source operands.
701/// The horizontal sums of the values are returned in the elements with
702/// index 0, 1, 4, 5 of a vector of [8 x float].
703/// \param __b
704/// A 256-bit vector of [8 x float] containing one of the source operands.
705/// The horizontal sums of the values are returned in the elements with
706/// index 2, 3, 6, 7 of a vector of [8 x float].
707/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
708/// both operands.
709static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
710 __m256 __b) {
711 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
712}
713
714/// Horizontally subtracts the adjacent pairs of values contained in two
715/// 256-bit vectors of [4 x double].
716///
717/// \headerfile <x86intrin.h>
718///
719/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
720///
721/// \param __a
722/// A 256-bit vector of [4 x double] containing one of the source operands.
723/// The horizontal differences between the values are returned in the
724/// even-indexed elements of a vector of [4 x double].
725/// \param __b
726/// A 256-bit vector of [4 x double] containing one of the source operands.
727/// The horizontal differences between the values are returned in the
728/// odd-indexed elements of a vector of [4 x double].
729/// \returns A 256-bit vector of [4 x double] containing the horizontal
730/// differences of both operands.
731static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
732_mm256_hsub_pd(__m256d __a, __m256d __b) {
733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
734}
735
736/// Horizontally subtracts the adjacent pairs of values contained in two
737/// 256-bit vectors of [8 x float].
738///
739/// \headerfile <x86intrin.h>
740///
741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
742///
743/// \param __a
744/// A 256-bit vector of [8 x float] containing one of the source operands.
745/// The horizontal differences between the values are returned in the
746/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
747/// \param __b
748/// A 256-bit vector of [8 x float] containing one of the source operands.
749/// The horizontal differences between the values are returned in the
750/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
751/// \returns A 256-bit vector of [8 x float] containing the horizontal
752/// differences of both operands.
753static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a,
754 __m256 __b) {
755 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
756}
757
758/* Vector permutations */
759/// Copies the values in a 128-bit vector of [2 x double] as specified
760/// by the 128-bit integer vector operand.
761///
762/// \headerfile <x86intrin.h>
763///
764/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
765///
766/// \param __a
767/// A 128-bit vector of [2 x double].
768/// \param __c
769/// A 128-bit integer vector operand specifying how the values are to be
770/// copied. \n
771/// Bit [1]: \n
772/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
773/// vector. \n
774/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
775/// returned vector. \n
776/// Bit [65]: \n
777/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
778/// returned vector. \n
779/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
780/// returned vector.
781/// \returns A 128-bit vector of [2 x double] containing the copied values.
782static __inline __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
783_mm_permutevar_pd(__m128d __a, __m128i __c) {
784 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
785}
786
787/// Copies the values in a 256-bit vector of [4 x double] as specified
788/// by the 256-bit integer vector operand.
789///
790/// \headerfile <x86intrin.h>
791///
792/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
793///
794/// \param __a
795/// A 256-bit vector of [4 x double].
796/// \param __c
797/// A 256-bit integer vector operand specifying how the values are to be
798/// copied. \n
799/// Bit [1]: \n
800/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
801/// vector. \n
802/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
803/// returned vector. \n
804/// Bit [65]: \n
805/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
806/// returned vector. \n
807/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
808/// returned vector. \n
809/// Bit [129]: \n
810/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
811/// returned vector. \n
812/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
813/// returned vector. \n
814/// Bit [193]: \n
815/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
816/// returned vector. \n
817/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
818/// returned vector.
819/// \returns A 256-bit vector of [4 x double] containing the copied values.
820static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
821_mm256_permutevar_pd(__m256d __a, __m256i __c) {
  /* 256-bit form: __a is passed as __v4df (4 x double) and the control
   * operand __c as __v4di (4 x long long); the result is cast to __m256d. */
822 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
823}
824
825/// Copies the values stored in a 128-bit vector of [4 x float] as
826/// specified by the 128-bit integer vector operand.
827///
828/// \headerfile <x86intrin.h>
829///
830/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
831///
832/// \param __a
833/// A 128-bit vector of [4 x float].
834/// \param __c
835/// A 128-bit integer vector operand specifying how the values are to be
836/// copied. \n
837/// Bits [1:0]: \n
838/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
839/// returned vector. \n
840/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
841/// returned vector. \n
842/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
843/// returned vector. \n
844/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
845/// returned vector. \n
846/// Bits [33:32]: \n
847/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
848/// returned vector. \n
849/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
850/// returned vector. \n
851/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
852/// returned vector. \n
853/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
854/// returned vector. \n
855/// Bits [65:64]: \n
856/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
857/// returned vector. \n
858/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
859/// returned vector. \n
860/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
861/// returned vector. \n
862/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
863/// returned vector. \n
864/// Bits [97:96]: \n
865/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
866/// returned vector. \n
867/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
868/// returned vector. \n
869/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
870/// returned vector. \n
871/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
872/// returned vector.
873/// \returns A 128-bit vector of [4 x float] containing the copied values.
874static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
875_mm_permutevar_ps(__m128 __a, __m128i __c) {
  /* Single-precision form: the data operand is viewed as __v4sf and the
   * control operand as __v4si before the builtin call; the result is cast
   * back to __m128. */
876 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
877}
878
879/// Copies the values stored in a 256-bit vector of [8 x float] as
880/// specified by the 256-bit integer vector operand.
881///
882/// \headerfile <x86intrin.h>
883///
884/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
885///
886/// \param __a
887/// A 256-bit vector of [8 x float].
888/// \param __c
889/// A 256-bit integer vector operand specifying how the values are to be
890/// copied. \n
891/// Bits [1:0]: \n
892/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
893/// returned vector. \n
894/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
895/// returned vector. \n
896/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
897/// returned vector. \n
898/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
899/// returned vector. \n
900/// Bits [33:32]: \n
901/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
902/// returned vector. \n
903/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
904/// returned vector. \n
905/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
906/// returned vector. \n
907/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
908/// returned vector. \n
909/// Bits [65:64]: \n
910/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
911/// returned vector. \n
912/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
913/// returned vector. \n
914/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
915/// returned vector. \n
916/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
917/// returned vector. \n
918/// Bits [97:96]: \n
919/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
920/// returned vector. \n
921/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
922/// returned vector. \n
923/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
924/// returned vector. \n
925/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
926/// returned vector. \n
927/// Bits [129:128]: \n
928/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
929/// returned vector. \n
930/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
931/// returned vector. \n
932/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
933/// returned vector. \n
934/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
935/// returned vector. \n
936/// Bits [161:160]: \n
937/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
938/// returned vector. \n
939/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
940/// returned vector. \n
941/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
942/// returned vector. \n
943/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
944/// returned vector. \n
945/// Bits [193:192]: \n
946/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
947/// returned vector. \n
948/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
949/// returned vector. \n
950/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
951/// returned vector. \n
952/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
953/// returned vector. \n
954/// Bits [225:224]: \n
955/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
956/// returned vector. \n
957/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
958/// returned vector. \n
959/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
960/// returned vector. \n
961/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
962/// returned vector.
963/// \returns A 256-bit vector of [8 x float] containing the copied values.
964static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
965_mm256_permutevar_ps(__m256 __a, __m256i __c) {
  /* 256-bit single-precision form: __a is passed as __v8sf (8 x float) and
   * the control operand __c as __v8si (8 x int); the result is cast to
   * __m256. */
966 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
967}
968
969/// Copies the values in a 128-bit vector of [2 x double] as specified
970/// by the immediate integer operand.
971///
972/// \headerfile <x86intrin.h>
973///
974/// \code
975/// __m128d _mm_permute_pd(__m128d A, const int C);
976/// \endcode
977///
978/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
979///
980/// \param A
981/// A 128-bit vector of [2 x double].
982/// \param C
983/// An immediate integer operand specifying how the values are to be
984/// copied. \n
985/// Bit [0]: \n
986/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
987/// vector. \n
988/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
989/// returned vector. \n
990/// Bit [1]: \n
991/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
992/// returned vector. \n
993/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
994/// returned vector.
995/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Macro form: A is cast through __m128d to __v2df and C is cast to int before
 * both are forwarded to the builtin; the result is cast back to __m128d. */
996#define _mm_permute_pd(A, C) \
997 ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
998
999/// Copies the values in a 256-bit vector of [4 x double] as specified by
1000/// the immediate integer operand.
1001///
1002/// \headerfile <x86intrin.h>
1003///
1004/// \code
1005/// __m256d _mm256_permute_pd(__m256d A, const int C);
1006/// \endcode
1007///
1008/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1009///
1010/// \param A
1011/// A 256-bit vector of [4 x double].
1012/// \param C
1013/// An immediate integer operand specifying how the values are to be
1014/// copied. \n
1015/// Bit [0]: \n
1016/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1017/// vector. \n
1018/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1019/// returned vector. \n
1020/// Bit [1]: \n
1021/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1022/// returned vector. \n
1023/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1024/// returned vector. \n
1025/// Bit [2]: \n
1026/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1027/// returned vector. \n
1028/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1029/// returned vector. \n
1030/// Bit [3]: \n
1031/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1032/// returned vector. \n
1033/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1034/// returned vector.
1035/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Macro form: A is cast through __m256d to __v4df and C is cast to int before
 * both are forwarded to the builtin; the result is cast back to __m256d. */
1036#define _mm256_permute_pd(A, C) \
1037 ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
1038
1039/// Copies the values in a 128-bit vector of [4 x float] as specified by
1040/// the immediate integer operand.
1041///
1042/// \headerfile <x86intrin.h>
1043///
1044/// \code
1045/// __m128 _mm_permute_ps(__m128 A, const int C);
1046/// \endcode
1047///
1048/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1049///
1050/// \param A
1051/// A 128-bit vector of [4 x float].
1052/// \param C
1053/// An immediate integer operand specifying how the values are to be
1054/// copied. \n
1055/// Bits [1:0]: \n
1056/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1057/// returned vector. \n
1058/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1059/// returned vector. \n
1060/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1061/// returned vector. \n
1062/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1063/// returned vector. \n
1064/// Bits [3:2]: \n
1065/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1066/// returned vector. \n
1067/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1068/// returned vector. \n
1069/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1070/// returned vector. \n
1071/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1072/// returned vector. \n
1073/// Bits [5:4]: \n
1074/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1075/// returned vector. \n
1076/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1077/// returned vector. \n
1078/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1079/// returned vector. \n
1080/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1081/// returned vector. \n
1082/// Bits [7:6]: \n
1083/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1084/// returned vector. \n
1085/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1086/// returned vector. \n
1087/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1088/// returned vector. \n
1089/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1090/// returned vector.
1091/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Macro form: A is cast through __m128 to __v4sf and C is cast to int before
 * both are forwarded to the builtin; the result is cast back to __m128. */
1092#define _mm_permute_ps(A, C) \
1093 ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
1094
1095/// Copies the values in a 256-bit vector of [8 x float] as specified by
1096/// the immediate integer operand.
1097///
1098/// \headerfile <x86intrin.h>
1099///
1100/// \code
1101/// __m256 _mm256_permute_ps(__m256 A, const int C);
1102/// \endcode
1103///
1104/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1105///
1106/// \param A
1107/// A 256-bit vector of [8 x float].
1108/// \param C
1109/// An immediate integer operand specifying how the values are to be
1110/// copied. \n
1111/// Bits [1:0]: \n
1112/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1113/// returned vector. \n
1114/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1115/// returned vector. \n
1116/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1117/// returned vector. \n
1118/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1119/// returned vector. \n
1120/// Bits [3:2]: \n
1121/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1122/// returned vector. \n
1123/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1124/// returned vector. \n
1125/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1126/// returned vector. \n
1127/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1128/// returned vector. \n
1129/// Bits [5:4]: \n
1130/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1131/// returned vector. \n
1132/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1133/// returned vector. \n
1134/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1135/// returned vector. \n
1136/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1137/// returned vector. \n
1138/// Bits [7:6]: \n
1139/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1140/// returned vector. \n
1141/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1142/// returned vector. \n
1143/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1144/// returned vector. \n
1145/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1146/// returned vector. \n
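/// The same bit fields also select the elements copied within the upper
/// 128 bits of the source and returned vector: \n
1147/// Bits [1:0]: \n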
1148/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1149/// returned vector. \n
1150/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1151/// returned vector. \n
1152/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1153/// returned vector. \n
1154/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1155/// returned vector. \n
1156/// Bits [3:2]: \n
1157/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1158/// returned vector. \n
1159/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1160/// returned vector. \n
1161/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1162/// returned vector. \n
1163/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1164/// returned vector. \n
1165/// Bits [5:4]: \n
1166/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1167/// returned vector. \n
1168/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1169/// returned vector. \n
1170/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1171/// returned vector. \n
1172/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1173/// returned vector. \n
1174/// Bits [7:6]: \n
1175/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1176/// returned vector. \n
1177/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1178/// returned vector. \n
1179/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1180/// returned vector. \n
1181/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1182/// returned vector.
1183/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Macro form: A is cast through __m256 to __v8sf and C is cast to int before
 * both are forwarded to the builtin; the result is cast back to __m256. */
1184#define _mm256_permute_ps(A, C) \
1185 ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
1186
1187/// Permutes 128-bit data values stored in two 256-bit vectors of
1188/// [4 x double], as specified by the immediate integer operand.
1189///
1190/// \headerfile <x86intrin.h>
1191///
1192/// \code
1193/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1194/// \endcode
1195///
1196/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1197///
1198/// \param V1
1199/// A 256-bit vector of [4 x double].
1200/// \param V2
1201/// A 256-bit vector of [4 x double].
1202/// \param M
1203/// An immediate integer operand specifying how the values are to be
1204/// permuted. \n
1205/// Bits [1:0]: \n
1206/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1207/// destination. \n
1208/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1209/// destination. \n
1210/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1211/// destination. \n
1212/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1213/// destination. \n
1214/// Bits [5:4]: \n
1215/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1216/// destination. \n
1217/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1218/// destination. \n
1219/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1220/// destination. \n
1221/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1222/// destination.
1223/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Macro form: V1 and V2 are each cast through __m256d to __v4df and M is cast
 * to int before the builtin call; the result is cast back to __m256d. */
1224#define _mm256_permute2f128_pd(V1, V2, M) \
1225 ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1226 (__v4df)(__m256d)(V2), (int)(M)))
1227
1228/// Permutes 128-bit data values stored in two 256-bit vectors of
1229/// [8 x float], as specified by the immediate integer operand.
1230///
1231/// \headerfile <x86intrin.h>
1232///
1233/// \code
1234/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1235/// \endcode
1236///
1237/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1238///
1239/// \param V1
1240/// A 256-bit vector of [8 x float].
1241/// \param V2
1242/// A 256-bit vector of [8 x float].
1243/// \param M
1244/// An immediate integer operand specifying how the values are to be
1245/// permuted. \n
1246/// Bits [1:0]: \n
1247/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1248/// destination. \n
1249/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1250/// destination. \n
1251/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1252/// destination. \n
1253/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1254/// destination. \n
1255/// Bits [5:4]: \n
1256/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1257/// destination. \n
1258/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1259/// destination. \n
1260/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1261/// destination. \n
1262/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1263/// destination.
1264/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Macro form: V1 and V2 are each cast through __m256 to __v8sf and M is cast
 * to int before the builtin call; the result is cast back to __m256. */
1265#define _mm256_permute2f128_ps(V1, V2, M) \
1266 ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1267 (__v8sf)(__m256)(V2), (int)(M)))
1268
1269/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1270/// as specified by the immediate integer operand.
1271///
1272/// \headerfile <x86intrin.h>
1273///
1274/// \code
1275/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1276/// \endcode
1277///
1278/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1279///
1280/// \param V1
1281/// A 256-bit integer vector.
1282/// \param V2
1283/// A 256-bit integer vector.
1284/// \param M
1285/// An immediate integer operand specifying how the values are to be copied. \n
1286/// Bits [1:0]: \n
1287/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1288/// destination. \n
1289/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1290/// destination. \n
1291/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1292/// destination. \n
1293/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1294/// destination. \n
1295/// Bits [5:4]: \n
1296/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1297/// destination. \n
1298/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1299/// destination. \n
1300/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1301/// destination. \n
1302/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1303/// destination.
1304/// \returns A 256-bit integer vector containing the copied values.
/* Macro form for integer vectors: V1 and V2 are each cast through __m256i to
 * __v8si (8 x int) and M is cast to int before the builtin call; the result
 * is cast back to __m256i. */
1305#define _mm256_permute2f128_si256(V1, V2, M) \
1306 ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1307 (__v8si)(__m256i)(V2), (int)(M)))
1308
1309/* Vector Blend */
1310/// Merges 64-bit double-precision data values stored in either of the
1311/// two 256-bit vectors of [4 x double], as specified by the immediate
1312/// integer operand.
1313///
1314/// \headerfile <x86intrin.h>
1315///
1316/// \code
1317/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1318/// \endcode
1319///
1320/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1321///
1322/// \param V1
1323/// A 256-bit vector of [4 x double].
1324/// \param V2
1325/// A 256-bit vector of [4 x double].
1326/// \param M
1327/// An immediate integer operand, with mask bits [3:0] specifying how the
1328/// values are to be copied. The position of the mask bit corresponds to the
1329/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1330/// element in operand \a V1 is copied to the same position in the
1331/// destination. When a mask bit is 1, the corresponding 64-bit element in
1332/// operand \a V2 is copied to the same position in the destination.
1333/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Macro form: V1 and V2 are each cast through __m256d to __v4df and the mask M
 * is cast to int before the builtin call; the result is cast back to
 * __m256d. */
1334#define _mm256_blend_pd(V1, V2, M) \
1335 ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
1336 (__v4df)(__m256d)(V2), (int)(M)))
1337
1338/// Merges 32-bit single-precision data values stored in either of the
1339/// two 256-bit vectors of [8 x float], as specified by the immediate
1340/// integer operand.
1341///
1342/// \headerfile <x86intrin.h>
1343///
1344/// \code
1345/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1346/// \endcode
1347///
1348/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1349///
1350/// \param V1
1351/// A 256-bit vector of [8 x float].
1352/// \param V2
1353/// A 256-bit vector of [8 x float].
1354/// \param M
1355/// An immediate integer operand, with mask bits [7:0] specifying how the
1356/// values are to be copied. The position of the mask bit corresponds to the
1357/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1358/// element in operand \a V1 is copied to the same position in the
1359/// destination. When a mask bit is 1, the corresponding 32-bit element in
1360/// operand \a V2 is copied to the same position in the destination.
1361/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Macro form: V1 and V2 are each cast through __m256 to __v8sf and the mask M
 * is cast to int before the builtin call; the result is cast back to
 * __m256. */
1362#define _mm256_blend_ps(V1, V2, M) \
1363 ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
1364 (__v8sf)(__m256)(V2), (int)(M)))
1365
1366/// Merges 64-bit double-precision data values stored in either of the
1367/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1368/// operand.
1369///
1370/// \headerfile <x86intrin.h>
1371///
1372/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1373///
1374/// \param __a
1375/// A 256-bit vector of [4 x double].
1376/// \param __b
1377/// A 256-bit vector of [4 x double].
1378/// \param __c
1379/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1380/// how the values are to be copied. The position of the mask bit corresponds
1381/// to the most significant bit of a copied value. When a mask bit is 0, the
1382/// corresponding 64-bit element in operand \a __a is copied to the same
1383/// position in the destination. When a mask bit is 1, the corresponding
1384/// 64-bit element in operand \a __b is copied to the same position in the
1385/// destination.
1386/// \returns A 256-bit vector of [4 x double] containing the copied values.
1387static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
1388_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {
  /* All three operands, including the mask __c, are passed to the builtin as
   * __v4df; the result is cast back to __m256d. */
1389 return (__m256d)__builtin_ia32_blendvpd256(
1390 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1391}
1392
1393/// Merges 32-bit single-precision data values stored in either of the
1394/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1395/// operand.
1396///
1397/// \headerfile <x86intrin.h>
1398///
1399/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1400///
1401/// \param __a
1402/// A 256-bit vector of [8 x float].
1403/// \param __b
1404/// A 256-bit vector of [8 x float].
1405/// \param __c
1406/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1407/// and 31 specifying how the values are to be copied. The position of the
1408/// mask bit corresponds to the most significant bit of a copied value. When
1409/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1410/// copied to the same position in the destination. When a mask bit is 1, the
1411/// corresponding 32-bit element in operand \a __b is copied to the same
1412/// position in the destination.
1413/// \returns A 256-bit vector of [8 x float] containing the copied values.
1414static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
1415_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {
  /* All three operands, including the mask __c, are passed to the builtin as
   * __v8sf; the result is cast back to __m256. */
1416 return (__m256)__builtin_ia32_blendvps256(
1417 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1418}
1419
1420/* Vector Dot Product */
1421/// Computes two dot products in parallel, using the lower and upper
1422/// halves of two [8 x float] vectors as input to the two computations, and
1423/// returning the two dot products in the lower and upper halves of the
1424/// [8 x float] result.
1425///
1426/// The immediate integer operand controls which input elements will
1427/// contribute to the dot product, and where the final results are returned.
1428/// In general, for each dot product, the four corresponding elements of the
1429/// input vectors are multiplied; the first two and second two products are
1430/// summed, then the two sums are added to form the final result.
1431///
1432/// \headerfile <x86intrin.h>
1433///
1434/// \code
1435/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1436/// \endcode
1437///
1438/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1439///
1440/// \param V1
1441/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1442/// \param V2
1443/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1444/// \param M
1445/// An immediate integer argument. Bits [7:4] determine which elements of
1446/// the input vectors are used, with bit [4] corresponding to the lowest
1447/// element and bit [7] corresponding to the highest element of each [4 x
1448/// float] subvector. If a bit is set, the corresponding elements from the
1449/// two input vectors are used as an input for dot product; otherwise that
1450/// input is treated as zero. Bits [3:0] determine which elements of the
1451/// result will receive a copy of the final dot product, with bit [0]
1452/// corresponding to the lowest element and bit [3] corresponding to the
1453/// highest element of each [4 x float] subvector. If a bit is set, the dot
1454/// product is returned in the corresponding element; otherwise that element
1455/// is set to zero. The bitmask is applied in the same way to each of the
1456/// two parallel dot product computations.
1457/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* Macro form: V1 and V2 are each cast through __m256 to __v8sf; the result is
 * cast back to __m256.
 * NOTE(review): M is forwarded as (M), without the (int) cast that the
 * neighbouring immediate-operand macros apply -- confirm this is intentional
 * for this builtin's immediate parameter. */
1458#define _mm256_dp_ps(V1, V2, M) \
1459 ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
1460 (__v8sf)(__m256)(V2), (M)))
1461
1462/* Vector shuffle */
1463/// Selects 8 float values from the 256-bit operands of [8 x float], as
1464/// specified by the immediate value operand.
1465///
1466/// The four selected elements in each operand are copied to the destination
1467/// according to the bits specified in the immediate operand. The selected
1468/// elements from the first 256-bit operand are copied to bits [63:0] and
1469/// bits [191:128] of the destination, and the selected elements from the
1470/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1471/// the destination. For example, if bits [7:0] of the immediate operand
1472/// contain a value of 0xFF, the 256-bit destination vector would contain the
1473/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1474///
1475/// \headerfile <x86intrin.h>
1476///
1477/// \code
1478/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1479/// \endcode
1480///
1481/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1482///
1483/// \param a
1484/// A 256-bit vector of [8 x float]. The four selected elements in this
1485/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1486/// according to the bits specified in the immediate operand.
1487/// \param b
1488/// A 256-bit vector of [8 x float]. The four selected elements in this
1489/// operand are copied to bits [127:64] and bits [255:192] in the
1490/// destination, according to the bits specified in the immediate operand.
1491/// \param mask
1492/// An immediate value containing an 8-bit value specifying which elements to
1493/// copy from \a a and \a b. \n
1494/// Bits [3:0] specify the values copied from operand \a a. \n
1495/// Bits [7:4] specify the values copied from operand \a b. \n
1496/// The destinations within the 256-bit destination are assigned values as
1497/// follows, according to the bit value assignments described below: \n
1498/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1499/// destination. \n
1500/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1501/// destination. \n
1502/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1503/// destination. \n
1504/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1505/// the destination. \n
1506/// Bit value assignments: \n
1507/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1508/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1509/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1510/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1511/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1512/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1513/// <c>[b6, b4, b2, b0]</c>.
1514/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
/* Macro form: a and b are each cast through __m256 to __v8sf and mask is cast
 * to int before the builtin call; the result is cast back to __m256. */
1515#define _mm256_shuffle_ps(a, b, mask) \
1516 ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
1517 (__v8sf)(__m256)(b), (int)(mask)))
1518
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate integer operand, with bits [3:0] specifying which elements
///    to copy from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1564
/* Compare */
/* Comparison predicate values 0x08 through 0x1f; each name matches the
 * corresponding entry in the predicate tables of the cmp intrinsics
 * documented below. */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
1590
/* The intrinsic below, defined in emmintrin.h, can be used for AVX. */
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1649
/* The intrinsic below, defined in xmmintrin.h, can be used for AVX. */
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1708
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1768
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1828
/* The intrinsic below, defined in emmintrin.h, can be used for AVX. */
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1887
/* The intrinsic below, defined in xmmintrin.h, can be used for AVX. */
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1946
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1968
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
/* The (unsigned short) cast performs the documented zero extension before the
 * result is widened to int. */
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
1991
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
/* The (unsigned char) cast performs the documented zero extension before the
 * result is widened to int. */
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2014
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2038
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2064
2065
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2091
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2117
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2145
2146/* Conversion */
2147/// Converts a vector of [4 x i32] into a vector of [4 x double].
2148///
2149/// \headerfile <x86intrin.h>
2150///
2151/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2152///
2153/// \param __a
2154/// A 128-bit integer vector of [4 x i32].
2155/// \returns A 256-bit vector of [4 x double] containing the converted values.
2156static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2158 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2159}
2160
2161/// Converts a vector of [8 x i32] into a vector of [8 x float].
2162///
2163/// \headerfile <x86intrin.h>
2164///
2165/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2166///
2167/// \param __a
2168/// A 256-bit integer vector.
2169/// \returns A 256-bit vector of [8 x float] containing the converted values.
2170static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2172 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2173}
2174
2175/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2176/// [4 x float].
2177///
2178/// \headerfile <x86intrin.h>
2179///
2180/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2181///
2182/// \param __a
2183/// A 256-bit vector of [4 x double].
2184/// \returns A 128-bit vector of [4 x float] containing the converted values.
2185static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2187 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2188}
2189
2190/// Converts a vector of [8 x float] into a vector of [8 x i32].
2191///
2192/// If a converted value does not fit in a 32-bit integer, raises a
2193/// floating-point invalid exception. If the exception is masked, returns
2194/// the most negative integer.
2195///
2196/// \headerfile <x86intrin.h>
2197///
2198/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2199///
2200/// \param __a
2201/// A 256-bit vector of [8 x float].
2202/// \returns A 256-bit integer vector containing the converted values.
2203static __inline __m256i __DEFAULT_FN_ATTRS
2205{
2206 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2207}
2208
2209/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2210/// x double].
2211///
2212/// \headerfile <x86intrin.h>
2213///
2214/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2215///
2216/// \param __a
2217/// A 128-bit vector of [4 x float].
2218/// \returns A 256-bit vector of [4 x double] containing the converted values.
2219static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2221 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2222}
2223
2224/// Converts a 256-bit vector of [4 x double] into four signed truncated
2225/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2226/// [4 x i32].
2227///
2228/// If a converted value does not fit in a 32-bit integer, raises a
2229/// floating-point invalid exception. If the exception is masked, returns
2230/// the most negative integer.
2231///
2232/// \headerfile <x86intrin.h>
2233///
2234/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2235///
2236/// \param __a
2237/// A 256-bit vector of [4 x double].
2238/// \returns A 128-bit integer vector containing the converted values.
2239static __inline __m128i __DEFAULT_FN_ATTRS
2241{
2242 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2243}
2244
2245/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2246/// [4 x i32].
2247///
2248/// If a converted value does not fit in a 32-bit integer, raises a
2249/// floating-point invalid exception. If the exception is masked, returns
2250/// the most negative integer.
2251///
2252/// \headerfile <x86intrin.h>
2253///
2254/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2255///
2256/// \param __a
2257/// A 256-bit vector of [4 x double].
2258/// \returns A 128-bit integer vector containing the converted values.
2259static __inline __m128i __DEFAULT_FN_ATTRS
2261{
2262 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2263}
2264
2265/// Converts a vector of [8 x float] into eight signed truncated (rounded
2266/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2267///
2268/// If a converted value does not fit in a 32-bit integer, raises a
2269/// floating-point invalid exception. If the exception is masked, returns
2270/// the most negative integer.
2271///
2272/// \headerfile <x86intrin.h>
2273///
2274/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2275///
2276/// \param __a
2277/// A 256-bit vector of [8 x float].
2278/// \returns A 256-bit integer vector containing the converted values.
2279static __inline __m256i __DEFAULT_FN_ATTRS
2281{
2282 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2283}
2284
2285/// Returns the first element of the input vector of [4 x double].
2286///
2287/// \headerfile <x86intrin.h>
2288///
2289/// This intrinsic is a utility function and does not correspond to a specific
2290/// instruction.
2291///
2292/// \param __a
2293/// A 256-bit vector of [4 x double].
2294/// \returns A 64 bit double containing the first element of the input vector.
2295static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR
2297 return __a[0];
2298}
2299
2300/// Returns the first element of the input vector of [8 x i32].
2301///
2302/// \headerfile <x86intrin.h>
2303///
2304/// This intrinsic is a utility function and does not correspond to a specific
2305/// instruction.
2306///
2307/// \param __a
2308/// A 256-bit vector of [8 x i32].
2309/// \returns A 32 bit integer containing the first element of the input vector.
2310static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2312 __v8si __b = (__v8si)__a;
2313 return __b[0];
2314}
2315
2316/// Returns the first element of the input vector of [8 x float].
2317///
2318/// \headerfile <x86intrin.h>
2319///
2320/// This intrinsic is a utility function and does not correspond to a specific
2321/// instruction.
2322///
2323/// \param __a
2324/// A 256-bit vector of [8 x float].
2325/// \returns A 32 bit float containing the first element of the input vector.
2326static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR
2328 return __a[0];
2329}
2330
2331/* Vector replicate */
2332/// Moves and duplicates odd-indexed values from a 256-bit vector of
2333/// [8 x float] to float values in a 256-bit vector of [8 x float].
2334///
2335/// \headerfile <x86intrin.h>
2336///
2337/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2338///
2339/// \param __a
2340/// A 256-bit vector of [8 x float]. \n
2341/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2342/// the return value. \n
2343/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2344/// the return value. \n
2345/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2346/// return value. \n
2347/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2348/// return value.
2349/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2350/// values.
2351static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2353{
2354 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2355}
2356
2357/// Moves and duplicates even-indexed values from a 256-bit vector of
2358/// [8 x float] to float values in a 256-bit vector of [8 x float].
2359///
2360/// \headerfile <x86intrin.h>
2361///
2362/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2363///
2364/// \param __a
2365/// A 256-bit vector of [8 x float]. \n
2366/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2367/// the return value. \n
2368/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2369/// the return value. \n
2370/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2371/// return value. \n
2372/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2373/// return value.
2374/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2375/// values.
2376static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2378{
2379 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2380}
2381
2382/// Moves and duplicates double-precision floating point values from a
2383/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2384/// vector of [4 x double].
2385///
2386/// \headerfile <x86intrin.h>
2387///
2388/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2389///
2390/// \param __a
2391/// A 256-bit vector of [4 x double]. \n
2392/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2393/// return value. \n
2394/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2395/// the return value.
2396/// \returns A 256-bit vector of [4 x double] containing the moved and
2397/// duplicated values.
2398static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2400{
2401 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2402}
2403
2404/* Unpack and Interleave */
2405/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2406/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2407///
2408/// \headerfile <x86intrin.h>
2409///
2410/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2411///
2412/// \param __a
2413/// A 256-bit floating-point vector of [4 x double]. \n
2414/// Bits [127:64] are written to bits [63:0] of the return value. \n
2415/// Bits [255:192] are written to bits [191:128] of the return value. \n
2416/// \param __b
2417/// A 256-bit floating-point vector of [4 x double]. \n
2418/// Bits [127:64] are written to bits [127:64] of the return value. \n
2419/// Bits [255:192] are written to bits [255:192] of the return value. \n
2420/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2421static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2422_mm256_unpackhi_pd(__m256d __a, __m256d __b) {
2423 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2424}
2425
2426/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2427/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2428///
2429/// \headerfile <x86intrin.h>
2430///
2431/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2432///
2433/// \param __a
2434/// A 256-bit floating-point vector of [4 x double]. \n
2435/// Bits [63:0] are written to bits [63:0] of the return value. \n
2436/// Bits [191:128] are written to bits [191:128] of the return value.
2437/// \param __b
2438/// A 256-bit floating-point vector of [4 x double]. \n
2439/// Bits [63:0] are written to bits [127:64] of the return value. \n
2440/// Bits [191:128] are written to bits [255:192] of the return value. \n
2441/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2442static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2443_mm256_unpacklo_pd(__m256d __a, __m256d __b) {
2444 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2445}
2446
2447/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2448/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2449/// vector of [8 x float].
2450///
2451/// \headerfile <x86intrin.h>
2452///
2453/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2454///
2455/// \param __a
2456/// A 256-bit vector of [8 x float]. \n
2457/// Bits [95:64] are written to bits [31:0] of the return value. \n
2458/// Bits [127:96] are written to bits [95:64] of the return value. \n
2459/// Bits [223:192] are written to bits [159:128] of the return value. \n
2460/// Bits [255:224] are written to bits [223:192] of the return value.
2461/// \param __b
2462/// A 256-bit vector of [8 x float]. \n
2463/// Bits [95:64] are written to bits [63:32] of the return value. \n
2464/// Bits [127:96] are written to bits [127:96] of the return value. \n
2465/// Bits [223:192] are written to bits [191:160] of the return value. \n
2466/// Bits [255:224] are written to bits [255:224] of the return value.
2467/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2468static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2469_mm256_unpackhi_ps(__m256 __a, __m256 __b) {
2470 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2471}
2472
2473/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2474/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2475/// vector of [8 x float].
2476///
2477/// \headerfile <x86intrin.h>
2478///
2479/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2480///
2481/// \param __a
2482/// A 256-bit vector of [8 x float]. \n
2483/// Bits [31:0] are written to bits [31:0] of the return value. \n
2484/// Bits [63:32] are written to bits [95:64] of the return value. \n
2485/// Bits [159:128] are written to bits [159:128] of the return value. \n
2486/// Bits [191:160] are written to bits [223:192] of the return value.
2487/// \param __b
2488/// A 256-bit vector of [8 x float]. \n
2489/// Bits [31:0] are written to bits [63:32] of the return value. \n
2490/// Bits [63:32] are written to bits [127:96] of the return value. \n
2491/// Bits [159:128] are written to bits [191:160] of the return value. \n
2492/// Bits [191:160] are written to bits [255:224] of the return value.
2493/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2494static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2495_mm256_unpacklo_ps(__m256 __a, __m256 __b) {
2496 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2497}
2498
2499/* Bit Test */
2500/// Given two 128-bit floating-point vectors of [2 x double], perform an
2501/// element-by-element comparison of the double-precision element in the
2502/// first source vector and the corresponding element in the second source
2503/// vector.
2504///
2505/// The EFLAGS register is updated as follows: \n
2506/// If there is at least one pair of double-precision elements where the
2507/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2508/// ZF flag is set to 1. \n
2509/// If there is at least one pair of double-precision elements where the
2510/// sign-bit of the first element is 0 and the sign-bit of the second element
2511/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2512/// This intrinsic returns the value of the ZF flag.
2513///
2514/// \headerfile <x86intrin.h>
2515///
2516/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2517///
2518/// \param __a
2519/// A 128-bit vector of [2 x double].
2520/// \param __b
2521/// A 128-bit vector of [2 x double].
2522/// \returns the ZF flag in the EFLAGS register.
2524 __m128d __b) {
2525 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2526}
2527
2528/// Given two 128-bit floating-point vectors of [2 x double], perform an
2529/// element-by-element comparison of the double-precision element in the
2530/// first source vector and the corresponding element in the second source
2531/// vector.
2532///
2533/// The EFLAGS register is updated as follows: \n
2534/// If there is at least one pair of double-precision elements where the
2535/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2536/// ZF flag is set to 1. \n
2537/// If there is at least one pair of double-precision elements where the
2538/// sign-bit of the first element is 0 and the sign-bit of the second element
2539/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2540/// This intrinsic returns the value of the CF flag.
2541///
2542/// \headerfile <x86intrin.h>
2543///
2544/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2545///
2546/// \param __a
2547/// A 128-bit vector of [2 x double].
2548/// \param __b
2549/// A 128-bit vector of [2 x double].
2550/// \returns the CF flag in the EFLAGS register.
2552 __m128d __b) {
2553 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2554}
2555
2556/// Given two 128-bit floating-point vectors of [2 x double], perform an
2557/// element-by-element comparison of the double-precision element in the
2558/// first source vector and the corresponding element in the second source
2559/// vector.
2560///
2561/// The EFLAGS register is updated as follows: \n
2562/// If there is at least one pair of double-precision elements where the
2563/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2564/// ZF flag is set to 1. \n
2565/// If there is at least one pair of double-precision elements where the
2566/// sign-bit of the first element is 0 and the sign-bit of the second element
2567/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2568/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2569/// otherwise it returns 0.
2570///
2571/// \headerfile <x86intrin.h>
2572///
2573/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2574///
2575/// \param __a
2576/// A 128-bit vector of [2 x double].
2577/// \param __b
2578/// A 128-bit vector of [2 x double].
2579/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2580static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR
2581_mm_testnzc_pd(__m128d __a, __m128d __b) {
2582 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2583}
2584
2585/// Given two 128-bit floating-point vectors of [4 x float], perform an
2586/// element-by-element comparison of the single-precision element in the
2587/// first source vector and the corresponding element in the second source
2588/// vector.
2589///
2590/// The EFLAGS register is updated as follows: \n
2591/// If there is at least one pair of single-precision elements where the
2592/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2593/// ZF flag is set to 1. \n
2594/// If there is at least one pair of single-precision elements where the
2595/// sign-bit of the first element is 0 and the sign-bit of the second element
2596/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2597/// This intrinsic returns the value of the ZF flag.
2598///
2599/// \headerfile <x86intrin.h>
2600///
2601/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2602///
2603/// \param __a
2604/// A 128-bit vector of [4 x float].
2605/// \param __b
2606/// A 128-bit vector of [4 x float].
2607/// \returns the ZF flag.
2609 __m128 __b) {
2610 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2611}
2612
2613/// Given two 128-bit floating-point vectors of [4 x float], perform an
2614/// element-by-element comparison of the single-precision element in the
2615/// first source vector and the corresponding element in the second source
2616/// vector.
2617///
2618/// The EFLAGS register is updated as follows: \n
2619/// If there is at least one pair of single-precision elements where the
2620/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2621/// ZF flag is set to 1. \n
2622/// If there is at least one pair of single-precision elements where the
2623/// sign-bit of the first element is 0 and the sign-bit of the second element
2624/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2625/// This intrinsic returns the value of the CF flag.
2626///
2627/// \headerfile <x86intrin.h>
2628///
2629/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2630///
2631/// \param __a
2632/// A 128-bit vector of [4 x float].
2633/// \param __b
2634/// A 128-bit vector of [4 x float].
2635/// \returns the CF flag.
2637 __m128 __b) {
2638 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2639}
2640
2641/// Given two 128-bit floating-point vectors of [4 x float], perform an
2642/// element-by-element comparison of the single-precision element in the
2643/// first source vector and the corresponding element in the second source
2644/// vector.
2645///
2646/// The EFLAGS register is updated as follows: \n
2647/// If there is at least one pair of single-precision elements where the
2648/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2649/// ZF flag is set to 1. \n
2650/// If there is at least one pair of single-precision elements where the
2651/// sign-bit of the first element is 0 and the sign-bit of the second element
2652/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2653/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2654/// otherwise it returns 0.
2655///
2656/// \headerfile <x86intrin.h>
2657///
2658/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2659///
2660/// \param __a
2661/// A 128-bit vector of [4 x float].
2662/// \param __b
2663/// A 128-bit vector of [4 x float].
2664/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2666 __m128 __b) {
2667 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2668}
2669
2670/// Given two 256-bit floating-point vectors of [4 x double], perform an
2671/// element-by-element comparison of the double-precision elements in the
2672/// first source vector and the corresponding elements in the second source
2673/// vector.
2674///
2675/// The EFLAGS register is updated as follows: \n
2676/// If there is at least one pair of double-precision elements where the
2677/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2678/// ZF flag is set to 1. \n
2679/// If there is at least one pair of double-precision elements where the
2680/// sign-bit of the first element is 0 and the sign-bit of the second element
2681/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2682/// This intrinsic returns the value of the ZF flag.
2683///
2684/// \headerfile <x86intrin.h>
2685///
2686/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2687///
2688/// \param __a
2689/// A 256-bit vector of [4 x double].
2690/// \param __b
2691/// A 256-bit vector of [4 x double].
2692/// \returns the ZF flag.
2694 __m256d __b) {
2695 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2696}
2697
2698/// Given two 256-bit floating-point vectors of [4 x double], perform an
2699/// element-by-element comparison of the double-precision elements in the
2700/// first source vector and the corresponding elements in the second source
2701/// vector.
2702///
2703/// The EFLAGS register is updated as follows: \n
2704/// If there is at least one pair of double-precision elements where the
2705/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2706/// ZF flag is set to 1. \n
2707/// If there is at least one pair of double-precision elements where the
2708/// sign-bit of the first element is 0 and the sign-bit of the second element
2709/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2710/// This intrinsic returns the value of the CF flag.
2711///
2712/// \headerfile <x86intrin.h>
2713///
2714/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2715///
2716/// \param __a
2717/// A 256-bit vector of [4 x double].
2718/// \param __b
2719/// A 256-bit vector of [4 x double].
2720/// \returns the CF flag.
2722 __m256d __b) {
2723 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2724}
2725
2726/// Given two 256-bit floating-point vectors of [4 x double], perform an
2727/// element-by-element comparison of the double-precision elements in the
2728/// first source vector and the corresponding elements in the second source
2729/// vector.
2730///
2731/// The EFLAGS register is updated as follows: \n
2732/// If there is at least one pair of double-precision elements where the
2733/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2734/// ZF flag is set to 1. \n
2735/// If there is at least one pair of double-precision elements where the
2736/// sign-bit of the first element is 0 and the sign-bit of the second element
2737/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2738/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2739/// otherwise it returns 0.
2740///
2741/// \headerfile <x86intrin.h>
2742///
2743/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2744///
2745/// \param __a
2746/// A 256-bit vector of [4 x double].
2747/// \param __b
2748/// A 256-bit vector of [4 x double].
2749/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2750static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2751_mm256_testnzc_pd(__m256d __a, __m256d __b) {
2752 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2753}
2754
2755/// Given two 256-bit floating-point vectors of [8 x float], perform an
2756/// element-by-element comparison of the single-precision element in the
2757/// first source vector and the corresponding element in the second source
2758/// vector.
2759///
2760/// The EFLAGS register is updated as follows: \n
2761/// If there is at least one pair of single-precision elements where the
2762/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2763/// ZF flag is set to 1. \n
2764/// If there is at least one pair of single-precision elements where the
2765/// sign-bit of the first element is 0 and the sign-bit of the second element
2766/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2767/// This intrinsic returns the value of the ZF flag.
2768///
2769/// \headerfile <x86intrin.h>
2770///
2771/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2772///
2773/// \param __a
2774/// A 256-bit vector of [8 x float].
2775/// \param __b
2776/// A 256-bit vector of [8 x float].
2777/// \returns the ZF flag.
2779 __m256 __b) {
2780 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2781}
2782
2783/// Given two 256-bit floating-point vectors of [8 x float], perform an
2784/// element-by-element comparison of the single-precision element in the
2785/// first source vector and the corresponding element in the second source
2786/// vector.
2787///
2788/// The EFLAGS register is updated as follows: \n
2789/// If there is at least one pair of single-precision elements where the
2790/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2791/// ZF flag is set to 1. \n
2792/// If there is at least one pair of single-precision elements where the
2793/// sign-bit of the first element is 0 and the sign-bit of the second element
2794/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2795/// This intrinsic returns the value of the CF flag.
2796///
2797/// \headerfile <x86intrin.h>
2798///
2799/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2800///
2801/// \param __a
2802/// A 256-bit vector of [8 x float].
2803/// \param __b
2804/// A 256-bit vector of [8 x float].
2805/// \returns the CF flag.
2807 __m256 __b) {
2808 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2809}
2810
2811/// Given two 256-bit floating-point vectors of [8 x float], perform an
2812/// element-by-element comparison of the single-precision elements in the
2813/// first source vector and the corresponding elements in the second source
2814/// vector.
2815///
2816/// The EFLAGS register is updated as follows: \n
2817/// If there is at least one pair of single-precision elements where the
2818/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2819/// ZF flag is set to 1. \n
2820/// If there is at least one pair of single-precision elements where the
2821/// sign-bit of the first element is 0 and the sign-bit of the second element
2822/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2823/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2824/// otherwise it returns 0.
2825///
2826/// \headerfile <x86intrin.h>
2827///
2828/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2829///
2830/// \param __a
2831/// A 256-bit vector of [8 x float].
2832/// \param __b
2833/// A 256-bit vector of [8 x float].
2834/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2836 __m256 __b) {
2837 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2838}
2839
2840/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2841/// of the two source vectors.
2842///
2843/// The EFLAGS register is updated as follows: \n
2844/// If there is at least one pair of bits where both bits are 1, the ZF flag
2845/// is set to 0. Otherwise the ZF flag is set to 1. \n
2846/// If there is at least one pair of bits where the bit from the first source
2847/// vector is 0 and the bit from the second source vector is 1, the CF flag
2848/// is set to 0. Otherwise the CF flag is set to 1. \n
2849/// This intrinsic returns the value of the ZF flag.
2850///
2851/// \headerfile <x86intrin.h>
2852///
2853/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2854///
2855/// \param __a
2856/// A 256-bit integer vector.
2857/// \param __b
2858/// A 256-bit integer vector.
2859/// \returns the ZF flag.
2860static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2861_mm256_testz_si256(__m256i __a, __m256i __b) {
2862 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2863}
2864
2865/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2866/// of the two source vectors.
2867///
2868/// The EFLAGS register is updated as follows: \n
2869/// If there is at least one pair of bits where both bits are 1, the ZF flag
2870/// is set to 0. Otherwise the ZF flag is set to 1. \n
2871/// If there is at least one pair of bits where the bit from the first source
2872/// vector is 0 and the bit from the second source vector is 1, the CF flag
2873/// is set to 0. Otherwise the CF flag is set to 1. \n
2874/// This intrinsic returns the value of the CF flag.
2875///
2876/// \headerfile <x86intrin.h>
2877///
2878/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2879///
2880/// \param __a
2881/// A 256-bit integer vector.
2882/// \param __b
2883/// A 256-bit integer vector.
2884/// \returns the CF flag.
2885static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2886_mm256_testc_si256(__m256i __a, __m256i __b) {
2887 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2888}
2889
2890/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2891/// of the two source vectors.
2892///
2893/// The EFLAGS register is updated as follows: \n
2894/// If there is at least one pair of bits where both bits are 1, the ZF flag
2895/// is set to 0. Otherwise the ZF flag is set to 1. \n
2896/// If there is at least one pair of bits where the bit from the first source
2897/// vector is 0 and the bit from the second source vector is 1, the CF flag
2898/// is set to 0. Otherwise the CF flag is set to 1. \n
2899/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2900/// otherwise it returns 0.
2901///
2902/// \headerfile <x86intrin.h>
2903///
2904/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2905///
2906/// \param __a
2907/// A 256-bit integer vector.
2908/// \param __b
2909/// A 256-bit integer vector.
2910/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2911static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2912_mm256_testnzc_si256(__m256i __a, __m256i __b) {
2913 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2914}
2915
2916/* Vector extract sign mask */
2917/// Extracts the sign bits of double-precision floating point elements
2918/// in a 256-bit vector of [4 x double] and writes them to the lower order
2919/// bits of the return value.
2920///
2921/// \headerfile <x86intrin.h>
2922///
2923/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2924///
2925/// \param __a
2926/// A 256-bit vector of [4 x double] containing the double-precision
2927/// floating point values with sign bits to be extracted.
2928/// \returns The sign bits from the operand, written to bits [3:0].
2929static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2931 return __builtin_ia32_movmskpd256((__v4df)__a);
2932}
2933
2934/// Extracts the sign bits of single-precision floating point elements
2935/// in a 256-bit vector of [8 x float] and writes them to the lower order
2936/// bits of the return value.
2937///
2938/// \headerfile <x86intrin.h>
2939///
2940/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2941///
2942/// \param __a
2943/// A 256-bit vector of [8 x float] containing the single-precision floating
2944/// point values with sign bits to be extracted.
2945/// \returns The sign bits from the operand, written to bits [7:0].
2946static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2948 return __builtin_ia32_movmskps256((__v8sf)__a);
2949}
2950
2951/* Vector __zero */
2952/// Zeroes the contents of all XMM or YMM registers.
2953///
2954/// \headerfile <x86intrin.h>
2955///
2956/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void) {
  // Takes no operands and returns nothing; the whole effect is the builtin.
  __builtin_ia32_vzeroall();
}
2962
2963/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2964///
2965/// \headerfile <x86intrin.h>
2966///
2967/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void) {
  // Takes no operands and returns nothing; the whole effect is the builtin.
  __builtin_ia32_vzeroupper();
}
2973
2974/* Vector load with broadcast */
2975/// Loads a scalar single-precision floating point value from the
2976/// specified address pointed to by \a __a and broadcasts it to the elements
2977/// of a [4 x float] vector.
2978///
2979/// \headerfile <x86intrin.h>
2980///
2981/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2982///
2983/// \param __a
2984/// The single-precision floating point value to be broadcast.
2985/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2986/// equal to the broadcast value.
2987static __inline __m128 __DEFAULT_FN_ATTRS128
2989{
2990 struct __mm_broadcast_ss_struct {
2991 float __f;
2992 } __attribute__((__packed__, __may_alias__));
2993 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
2994 return __extension__ (__m128){ __f, __f, __f, __f };
2995}
2996
2997/// Loads a scalar double-precision floating point value from the
2998/// specified address pointed to by \a __a and broadcasts it to the elements
2999/// of a [4 x double] vector.
3000///
3001/// \headerfile <x86intrin.h>
3002///
3003/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3004///
3005/// \param __a
3006/// The double-precision floating point value to be broadcast.
3007/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3008/// equal to the broadcast value.
3009static __inline __m256d __DEFAULT_FN_ATTRS
3011{
3012 struct __mm256_broadcast_sd_struct {
3013 double __d;
3014 } __attribute__((__packed__, __may_alias__));
3015 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3016 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3017}
3018
3019/// Loads a scalar single-precision floating point value from the
3020/// specified address pointed to by \a __a and broadcasts it to the elements
3021/// of a [8 x float] vector.
3022///
3023/// \headerfile <x86intrin.h>
3024///
3025/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3026///
3027/// \param __a
3028/// The single-precision floating point value to be broadcast.
3029/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3030/// equal to the broadcast value.
3031static __inline __m256 __DEFAULT_FN_ATTRS
3033{
3034 struct __mm256_broadcast_ss_struct {
3035 float __f;
3036 } __attribute__((__packed__, __may_alias__));
3037 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3038 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3039}
3040
3041/// Loads the data from a 128-bit vector of [2 x double] from the
3042/// specified address pointed to by \a __a and broadcasts it to 128-bit
3043/// elements in a 256-bit vector of [4 x double].
3044///
3045/// \headerfile <x86intrin.h>
3046///
3047/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3048///
3049/// \param __a
3050/// The 128-bit vector of [2 x double] to be broadcast.
3051/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3052/// equal to the broadcast value.
3053static __inline __m256d __DEFAULT_FN_ATTRS
3055{
3056 __m128d __b = _mm_loadu_pd((const double *)__a);
3057 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3058 0, 1, 0, 1);
3059}
3060
3061/// Loads the data from a 128-bit vector of [4 x float] from the
3062/// specified address pointed to by \a __a and broadcasts it to 128-bit
3063/// elements in a 256-bit vector of [8 x float].
3064///
3065/// \headerfile <x86intrin.h>
3066///
3067/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3068///
3069/// \param __a
3070/// The 128-bit vector of [4 x float] to be broadcast.
3071/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3072/// equal to the broadcast value.
3073static __inline __m256 __DEFAULT_FN_ATTRS
3075{
3076 __m128 __b = _mm_loadu_ps((const float *)__a);
3077 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3078 0, 1, 2, 3, 0, 1, 2, 3);
3079}
3080
/* SIMD load ops */
3082/// Loads 4 double-precision floating point values from a 32-byte aligned
3083/// memory location pointed to by \a __p into a vector of [4 x double].
3084///
3085/// \headerfile <x86intrin.h>
3086///
3087/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3088///
3089/// \param __p
3090/// A 32-byte aligned pointer to a memory location containing
3091/// double-precision floating point values.
3092/// \returns A 256-bit vector of [4 x double] containing the moved values.
3093static __inline __m256d __DEFAULT_FN_ATTRS
3094_mm256_load_pd(double const *__p)
3095{
3096 return *(const __m256d *)__p;
3097}
3098
3099/// Loads 8 single-precision floating point values from a 32-byte aligned
3100/// memory location pointed to by \a __p into a vector of [8 x float].
3101///
3102/// \headerfile <x86intrin.h>
3103///
3104/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3105///
3106/// \param __p
3107/// A 32-byte aligned pointer to a memory location containing float values.
3108/// \returns A 256-bit vector of [8 x float] containing the moved values.
3109static __inline __m256 __DEFAULT_FN_ATTRS
3110_mm256_load_ps(float const *__p)
3111{
3112 return *(const __m256 *)__p;
3113}
3114
3115/// Loads 4 double-precision floating point values from an unaligned
3116/// memory location pointed to by \a __p into a vector of [4 x double].
3117///
3118/// \headerfile <x86intrin.h>
3119///
3120/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3121///
3122/// \param __p
3123/// A pointer to a memory location containing double-precision floating
3124/// point values.
3125/// \returns A 256-bit vector of [4 x double] containing the moved values.
3126static __inline __m256d __DEFAULT_FN_ATTRS
3127_mm256_loadu_pd(double const *__p)
3128{
3129 struct __loadu_pd {
3130 __m256d_u __v;
3131 } __attribute__((__packed__, __may_alias__));
3132 return ((const struct __loadu_pd*)__p)->__v;
3133}
3134
3135/// Loads 8 single-precision floating point values from an unaligned
3136/// memory location pointed to by \a __p into a vector of [8 x float].
3137///
3138/// \headerfile <x86intrin.h>
3139///
3140/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3141///
3142/// \param __p
3143/// A pointer to a memory location containing single-precision floating
3144/// point values.
3145/// \returns A 256-bit vector of [8 x float] containing the moved values.
3146static __inline __m256 __DEFAULT_FN_ATTRS
3148{
3149 struct __loadu_ps {
3150 __m256_u __v;
3151 } __attribute__((__packed__, __may_alias__));
3152 return ((const struct __loadu_ps*)__p)->__v;
3153}
3154
3155/// Loads 256 bits of integer data from a 32-byte aligned memory
3156/// location pointed to by \a __p into elements of a 256-bit integer vector.
3157///
3158/// \headerfile <x86intrin.h>
3159///
3160/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3161///
3162/// \param __p
3163/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3164/// values.
3165/// \returns A 256-bit integer vector containing the moved values.
3166static __inline __m256i __DEFAULT_FN_ATTRS
3167_mm256_load_si256(__m256i const *__p)
3168{
3169 return *__p;
3170}
3171
3172/// Loads 256 bits of integer data from an unaligned memory location
3173/// pointed to by \a __p into a 256-bit integer vector.
3174///
3175/// \headerfile <x86intrin.h>
3176///
3177/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3178///
3179/// \param __p
3180/// A pointer to a 256-bit integer vector containing integer values.
3181/// \returns A 256-bit integer vector containing the moved values.
3182static __inline __m256i __DEFAULT_FN_ATTRS
3183_mm256_loadu_si256(__m256i_u const *__p)
3184{
3185 struct __loadu_si256 {
3186 __m256i_u __v;
3187 } __attribute__((__packed__, __may_alias__));
3188 return ((const struct __loadu_si256*)__p)->__v;
3189}
3190
3191/// Loads 256 bits of integer data from an unaligned memory location
3192/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3193/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3194/// line boundary.
3195///
3196/// \headerfile <x86intrin.h>
3197///
3198/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3199///
3200/// \param __p
3201/// A pointer to a 256-bit integer vector containing integer values.
3202/// \returns A 256-bit integer vector containing the moved values.
3203static __inline __m256i __DEFAULT_FN_ATTRS
3204_mm256_lddqu_si256(__m256i_u const *__p)
3205{
3206 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3207}
3208
/* SIMD store ops */
3210/// Stores double-precision floating point values from a 256-bit vector
3211/// of [4 x double] to a 32-byte aligned memory location pointed to by
3212/// \a __p.
3213///
3214/// \headerfile <x86intrin.h>
3215///
3216/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3217///
3218/// \param __p
3219/// A 32-byte aligned pointer to a memory location that will receive the
3220/// double-precision floaing point values.
3221/// \param __a
3222/// A 256-bit vector of [4 x double] containing the values to be moved.
3223static __inline void __DEFAULT_FN_ATTRS
3224_mm256_store_pd(double *__p, __m256d __a)
3225{
3226 *(__m256d *)__p = __a;
3227}
3228
3229/// Stores single-precision floating point values from a 256-bit vector
3230/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3231///
3232/// \headerfile <x86intrin.h>
3233///
3234/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3235///
3236/// \param __p
3237/// A 32-byte aligned pointer to a memory location that will receive the
3238/// float values.
3239/// \param __a
3240/// A 256-bit vector of [8 x float] containing the values to be moved.
3241static __inline void __DEFAULT_FN_ATTRS
3242_mm256_store_ps(float *__p, __m256 __a)
3243{
3244 *(__m256 *)__p = __a;
3245}
3246
3247/// Stores double-precision floating point values from a 256-bit vector
3248/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3249///
3250/// \headerfile <x86intrin.h>
3251///
3252/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3253///
3254/// \param __p
3255/// A pointer to a memory location that will receive the double-precision
3256/// floating point values.
3257/// \param __a
3258/// A 256-bit vector of [4 x double] containing the values to be moved.
3259static __inline void __DEFAULT_FN_ATTRS
3260_mm256_storeu_pd(double *__p, __m256d __a)
3261{
3262 struct __storeu_pd {
3263 __m256d_u __v;
3264 } __attribute__((__packed__, __may_alias__));
3265 ((struct __storeu_pd*)__p)->__v = __a;
3266}
3267
3268/// Stores single-precision floating point values from a 256-bit vector
3269/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3270///
3271/// \headerfile <x86intrin.h>
3272///
3273/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3274///
3275/// \param __p
3276/// A pointer to a memory location that will receive the float values.
3277/// \param __a
3278/// A 256-bit vector of [8 x float] containing the values to be moved.
3279static __inline void __DEFAULT_FN_ATTRS
3280_mm256_storeu_ps(float *__p, __m256 __a)
3281{
3282 struct __storeu_ps {
3283 __m256_u __v;
3284 } __attribute__((__packed__, __may_alias__));
3285 ((struct __storeu_ps*)__p)->__v = __a;
3286}
3287
3288/// Stores integer values from a 256-bit integer vector to a 32-byte
3289/// aligned memory location pointed to by \a __p.
3290///
3291/// \headerfile <x86intrin.h>
3292///
3293/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3294///
3295/// \param __p
3296/// A 32-byte aligned pointer to a memory location that will receive the
3297/// integer values.
3298/// \param __a
3299/// A 256-bit integer vector containing the values to be moved.
3300static __inline void __DEFAULT_FN_ATTRS
3301_mm256_store_si256(__m256i *__p, __m256i __a)
3302{
3303 *__p = __a;
3304}
3305
3306/// Stores integer values from a 256-bit integer vector to an unaligned
3307/// memory location pointed to by \a __p.
3308///
3309/// \headerfile <x86intrin.h>
3310///
3311/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3312///
3313/// \param __p
3314/// A pointer to a memory location that will receive the integer values.
3315/// \param __a
3316/// A 256-bit integer vector containing the values to be moved.
3317static __inline void __DEFAULT_FN_ATTRS
3318_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3319{
3320 struct __storeu_si256 {
3321 __m256i_u __v;
3322 } __attribute__((__packed__, __may_alias__));
3323 ((struct __storeu_si256*)__p)->__v = __a;
3324}
3325
/* Conditional load ops */
3327/// Conditionally loads double-precision floating point elements from a
3328/// memory location pointed to by \a __p into a 128-bit vector of
3329/// [2 x double], depending on the mask bits associated with each data
3330/// element.
3331///
3332/// \headerfile <x86intrin.h>
3333///
3334/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3335///
3336/// \param __p
3337/// A pointer to a memory location that contains the double-precision
3338/// floating point values.
3339/// \param __m
3340/// A 128-bit integer vector containing the mask. The most significant bit of
3341/// each data element represents the mask bits. If a mask bit is zero, the
3342/// corresponding value in the memory location is not loaded and the
3343/// corresponding field in the return value is set to zero.
3344/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3345static __inline __m128d __DEFAULT_FN_ATTRS128
3346_mm_maskload_pd(double const *__p, __m128i __m)
3347{
3348 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3349}
3350
3351/// Conditionally loads double-precision floating point elements from a
3352/// memory location pointed to by \a __p into a 256-bit vector of
3353/// [4 x double], depending on the mask bits associated with each data
3354/// element.
3355///
3356/// \headerfile <x86intrin.h>
3357///
3358/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3359///
3360/// \param __p
3361/// A pointer to a memory location that contains the double-precision
3362/// floating point values.
3363/// \param __m
3364/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3365/// significant bit of each quadword element represents the mask bits. If a
3366/// mask bit is zero, the corresponding value in the memory location is not
3367/// loaded and the corresponding field in the return value is set to zero.
3368/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3369static __inline __m256d __DEFAULT_FN_ATTRS
3370_mm256_maskload_pd(double const *__p, __m256i __m)
3371{
3372 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3373 (__v4di)__m);
3374}
3375
3376/// Conditionally loads single-precision floating point elements from a
3377/// memory location pointed to by \a __p into a 128-bit vector of
3378/// [4 x float], depending on the mask bits associated with each data
3379/// element.
3380///
3381/// \headerfile <x86intrin.h>
3382///
3383/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3384///
3385/// \param __p
3386/// A pointer to a memory location that contains the single-precision
3387/// floating point values.
3388/// \param __m
3389/// A 128-bit integer vector containing the mask. The most significant bit of
3390/// each data element represents the mask bits. If a mask bit is zero, the
3391/// corresponding value in the memory location is not loaded and the
3392/// corresponding field in the return value is set to zero.
3393/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3394static __inline __m128 __DEFAULT_FN_ATTRS128
3395_mm_maskload_ps(float const *__p, __m128i __m)
3396{
3397 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3398}
3399
3400/// Conditionally loads single-precision floating point elements from a
3401/// memory location pointed to by \a __p into a 256-bit vector of
3402/// [8 x float], depending on the mask bits associated with each data
3403/// element.
3404///
3405/// \headerfile <x86intrin.h>
3406///
3407/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3408///
3409/// \param __p
3410/// A pointer to a memory location that contains the single-precision
3411/// floating point values.
3412/// \param __m
3413/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3414/// significant bit of each dword element represents the mask bits. If a mask
3415/// bit is zero, the corresponding value in the memory location is not loaded
3416/// and the corresponding field in the return value is set to zero.
3417/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3418static __inline __m256 __DEFAULT_FN_ATTRS
3419_mm256_maskload_ps(float const *__p, __m256i __m)
3420{
3421 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3422}
3423
/* Conditional store ops */
3425/// Moves single-precision floating point values from a 256-bit vector
3426/// of [8 x float] to a memory location pointed to by \a __p, according to
3427/// the specified mask.
3428///
3429/// \headerfile <x86intrin.h>
3430///
3431/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3432///
3433/// \param __p
3434/// A pointer to a memory location that will receive the float values.
3435/// \param __m
3436/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3437/// significant bit of each dword element in the mask vector represents the
3438/// mask bits. If a mask bit is zero, the corresponding value from vector
3439/// \a __a is not stored and the corresponding field in the memory location
3440/// pointed to by \a __p is not changed.
3441/// \param __a
3442/// A 256-bit vector of [8 x float] containing the values to be stored.
3443static __inline void __DEFAULT_FN_ATTRS
3444_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3445{
3446 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3447}
3448
3449/// Moves double-precision values from a 128-bit vector of [2 x double]
3450/// to a memory location pointed to by \a __p, according to the specified
3451/// mask.
3452///
3453/// \headerfile <x86intrin.h>
3454///
3455/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3456///
3457/// \param __p
3458/// A pointer to a memory location that will receive the float values.
3459/// \param __m
3460/// A 128-bit integer vector containing the mask. The most significant bit of
3461/// each field in the mask vector represents the mask bits. If a mask bit is
3462/// zero, the corresponding value from vector \a __a is not stored and the
3463/// corresponding field in the memory location pointed to by \a __p is not
3464/// changed.
3465/// \param __a
3466/// A 128-bit vector of [2 x double] containing the values to be stored.
3467static __inline void __DEFAULT_FN_ATTRS128
3468_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3469{
3470 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3471}
3472
3473/// Moves double-precision values from a 256-bit vector of [4 x double]
3474/// to a memory location pointed to by \a __p, according to the specified
3475/// mask.
3476///
3477/// \headerfile <x86intrin.h>
3478///
3479/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3480///
3481/// \param __p
3482/// A pointer to a memory location that will receive the float values.
3483/// \param __m
3484/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3485/// significant bit of each quadword element in the mask vector represents
3486/// the mask bits. If a mask bit is zero, the corresponding value from vector
3487/// __a is not stored and the corresponding field in the memory location
3488/// pointed to by \a __p is not changed.
3489/// \param __a
3490/// A 256-bit vector of [4 x double] containing the values to be stored.
3491static __inline void __DEFAULT_FN_ATTRS
3492_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3493{
3494 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3495}
3496
3497/// Moves single-precision floating point values from a 128-bit vector
3498/// of [4 x float] to a memory location pointed to by \a __p, according to
3499/// the specified mask.
3500///
3501/// \headerfile <x86intrin.h>
3502///
3503/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3504///
3505/// \param __p
3506/// A pointer to a memory location that will receive the float values.
3507/// \param __m
3508/// A 128-bit integer vector containing the mask. The most significant bit of
3509/// each field in the mask vector represents the mask bits. If a mask bit is
3510/// zero, the corresponding value from vector __a is not stored and the
3511/// corresponding field in the memory location pointed to by \a __p is not
3512/// changed.
3513/// \param __a
3514/// A 128-bit vector of [4 x float] containing the values to be stored.
3515static __inline void __DEFAULT_FN_ATTRS128
3516_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3517{
3518 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3519}
3520
/* Cacheability support ops */
3522/// Moves integer data from a 256-bit integer vector to a 32-byte
3523/// aligned memory location. To minimize caching, the data is flagged as
3524/// non-temporal (unlikely to be used again soon).
3525///
3526/// \headerfile <x86intrin.h>
3527///
3528/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3529///
3530/// \param __a
3531/// A pointer to a 32-byte aligned memory location that will receive the
3532/// integer values.
3533/// \param __b
3534/// A 256-bit integer vector containing the values to be moved.
3535static __inline void __DEFAULT_FN_ATTRS
3537{
3538 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3539 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3540}
3541
3542/// Moves double-precision values from a 256-bit vector of [4 x double]
3543/// to a 32-byte aligned memory location. To minimize caching, the data is
3544/// flagged as non-temporal (unlikely to be used again soon).
3545///
3546/// \headerfile <x86intrin.h>
3547///
3548/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3549///
3550/// \param __a
3551/// A pointer to a 32-byte aligned memory location that will receive the
3552/// double-precision floating-point values.
3553/// \param __b
3554/// A 256-bit vector of [4 x double] containing the values to be moved.
3555static __inline void __DEFAULT_FN_ATTRS
3556_mm256_stream_pd(void *__a, __m256d __b)
3557{
3558 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3559 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3560}
3561
3562/// Moves single-precision floating point values from a 256-bit vector
3563/// of [8 x float] to a 32-byte aligned memory location. To minimize
3564/// caching, the data is flagged as non-temporal (unlikely to be used again
3565/// soon).
3566///
3567/// \headerfile <x86intrin.h>
3568///
3569/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3570///
3571/// \param __p
3572/// A pointer to a 32-byte aligned memory location that will receive the
3573/// single-precision floating point values.
3574/// \param __a
3575/// A 256-bit vector of [8 x float] containing the values to be moved.
3576static __inline void __DEFAULT_FN_ATTRS
3578{
3579 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3580 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3581}
3582
/* Create vectors */
3584/// Create a 256-bit vector of [4 x double] with undefined values.
3585///
3586/// \headerfile <x86intrin.h>
3587///
3588/// This intrinsic has no corresponding instruction.
3589///
3590/// \returns A 256-bit vector of [4 x double] containing undefined values.
3591static __inline__ __m256d __DEFAULT_FN_ATTRS
3593{
3594 return (__m256d)__builtin_ia32_undef256();
3595}
3596
3597/// Create a 256-bit vector of [8 x float] with undefined values.
3598///
3599/// \headerfile <x86intrin.h>
3600///
3601/// This intrinsic has no corresponding instruction.
3602///
3603/// \returns A 256-bit vector of [8 x float] containing undefined values.
3604static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void) {
3605 return (__m256)__builtin_ia32_undef256();
3606}
3607
3608/// Create a 256-bit integer vector with undefined values.
3609///
3610/// \headerfile <x86intrin.h>
3611///
3612/// This intrinsic has no corresponding instruction.
3613///
3614/// \returns A 256-bit integer vector containing undefined values.
3615static __inline__ __m256i __DEFAULT_FN_ATTRS
3617{
3618 return (__m256i)__builtin_ia32_undef256();
3619}
3620
3621/// Constructs a 256-bit floating-point vector of [4 x double]
3622/// initialized with the specified double-precision floating-point values.
3623///
3624/// \headerfile <x86intrin.h>
3625///
3626/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3627/// instruction.
3628///
3629/// \param __a
3630/// A double-precision floating-point value used to initialize bits [255:192]
3631/// of the result.
3632/// \param __b
3633/// A double-precision floating-point value used to initialize bits [191:128]
3634/// of the result.
3635/// \param __c
3636/// A double-precision floating-point value used to initialize bits [127:64]
3637/// of the result.
3638/// \param __d
3639/// A double-precision floating-point value used to initialize bits [63:0]
3640/// of the result.
3641/// \returns An initialized 256-bit floating-point vector of [4 x double].
3642static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3643_mm256_set_pd(double __a, double __b, double __c, double __d)
3644{
3645 return __extension__ (__m256d){ __d, __c, __b, __a };
3646}
3647
3648/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3649/// with the specified single-precision floating-point values.
3650///
3651/// \headerfile <x86intrin.h>
3652///
3653/// This intrinsic is a utility function and does not correspond to a specific
3654/// instruction.
3655///
3656/// \param __a
3657/// A single-precision floating-point value used to initialize bits [255:224]
3658/// of the result.
3659/// \param __b
3660/// A single-precision floating-point value used to initialize bits [223:192]
3661/// of the result.
3662/// \param __c
3663/// A single-precision floating-point value used to initialize bits [191:160]
3664/// of the result.
3665/// \param __d
3666/// A single-precision floating-point value used to initialize bits [159:128]
3667/// of the result.
3668/// \param __e
3669/// A single-precision floating-point value used to initialize bits [127:96]
3670/// of the result.
3671/// \param __f
3672/// A single-precision floating-point value used to initialize bits [95:64]
3673/// of the result.
3674/// \param __g
3675/// A single-precision floating-point value used to initialize bits [63:32]
3676/// of the result.
3677/// \param __h
3678/// A single-precision floating-point value used to initialize bits [31:0]
3679/// of the result.
3680/// \returns An initialized 256-bit floating-point vector of [8 x float].
3681static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3682_mm256_set_ps(float __a, float __b, float __c, float __d,
3683 float __e, float __f, float __g, float __h)
3684{
3685 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3686}
3687
3688/// Constructs a 256-bit integer vector initialized with the specified
3689/// 32-bit integral values.
3690///
3691/// \headerfile <x86intrin.h>
3692///
3693/// This intrinsic is a utility function and does not correspond to a specific
3694/// instruction.
3695///
3696/// \param __i0
3697/// A 32-bit integral value used to initialize bits [255:224] of the result.
3698/// \param __i1
3699/// A 32-bit integral value used to initialize bits [223:192] of the result.
3700/// \param __i2
3701/// A 32-bit integral value used to initialize bits [191:160] of the result.
3702/// \param __i3
3703/// A 32-bit integral value used to initialize bits [159:128] of the result.
3704/// \param __i4
3705/// A 32-bit integral value used to initialize bits [127:96] of the result.
3706/// \param __i5
3707/// A 32-bit integral value used to initialize bits [95:64] of the result.
3708/// \param __i6
3709/// A 32-bit integral value used to initialize bits [63:32] of the result.
3710/// \param __i7
3711/// A 32-bit integral value used to initialize bits [31:0] of the result.
3712/// \returns An initialized 256-bit integer vector.
3713static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3714_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3715 int __i4, int __i5, int __i6, int __i7)
3716{
3717 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3718}
3719
3720/// Constructs a 256-bit integer vector initialized with the specified
3721/// 16-bit integral values.
3722///
3723/// \headerfile <x86intrin.h>
3724///
3725/// This intrinsic is a utility function and does not correspond to a specific
3726/// instruction.
3727///
3728/// \param __w15
3729/// A 16-bit integral value used to initialize bits [255:240] of the result.
3730/// \param __w14
3731/// A 16-bit integral value used to initialize bits [239:224] of the result.
3732/// \param __w13
3733/// A 16-bit integral value used to initialize bits [223:208] of the result.
3734/// \param __w12
3735/// A 16-bit integral value used to initialize bits [207:192] of the result.
3736/// \param __w11
3737/// A 16-bit integral value used to initialize bits [191:176] of the result.
3738/// \param __w10
3739/// A 16-bit integral value used to initialize bits [175:160] of the result.
3740/// \param __w09
3741/// A 16-bit integral value used to initialize bits [159:144] of the result.
3742/// \param __w08
3743/// A 16-bit integral value used to initialize bits [143:128] of the result.
3744/// \param __w07
3745/// A 16-bit integral value used to initialize bits [127:112] of the result.
3746/// \param __w06
3747/// A 16-bit integral value used to initialize bits [111:96] of the result.
3748/// \param __w05
3749/// A 16-bit integral value used to initialize bits [95:80] of the result.
3750/// \param __w04
3751/// A 16-bit integral value used to initialize bits [79:64] of the result.
3752/// \param __w03
3753/// A 16-bit integral value used to initialize bits [63:48] of the result.
3754/// \param __w02
3755/// A 16-bit integral value used to initialize bits [47:32] of the result.
3756/// \param __w01
3757/// A 16-bit integral value used to initialize bits [31:16] of the result.
3758/// \param __w00
3759/// A 16-bit integral value used to initialize bits [15:0] of the result.
3760/// \returns An initialized 256-bit integer vector.
3761static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3762_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3763 short __w11, short __w10, short __w09, short __w08,
3764 short __w07, short __w06, short __w05, short __w04,
3765 short __w03, short __w02, short __w01, short __w00)
3766{
3767 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3768 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3769}
3770
3771/// Constructs a 256-bit integer vector initialized with the specified
3772/// 8-bit integral values.
3773///
3774/// \headerfile <x86intrin.h>
3775///
3776/// This intrinsic is a utility function and does not correspond to a specific
3777/// instruction.
3778///
3779/// \param __b31
3780/// An 8-bit integral value used to initialize bits [255:248] of the result.
3781/// \param __b30
3782/// An 8-bit integral value used to initialize bits [247:240] of the result.
3783/// \param __b29
3784/// An 8-bit integral value used to initialize bits [239:232] of the result.
3785/// \param __b28
3786/// An 8-bit integral value used to initialize bits [231:224] of the result.
3787/// \param __b27
3788/// An 8-bit integral value used to initialize bits [223:216] of the result.
3789/// \param __b26
3790/// An 8-bit integral value used to initialize bits [215:208] of the result.
3791/// \param __b25
3792/// An 8-bit integral value used to initialize bits [207:200] of the result.
3793/// \param __b24
3794/// An 8-bit integral value used to initialize bits [199:192] of the result.
3795/// \param __b23
3796/// An 8-bit integral value used to initialize bits [191:184] of the result.
3797/// \param __b22
3798/// An 8-bit integral value used to initialize bits [183:176] of the result.
3799/// \param __b21
3800/// An 8-bit integral value used to initialize bits [175:168] of the result.
3801/// \param __b20
3802/// An 8-bit integral value used to initialize bits [167:160] of the result.
3803/// \param __b19
3804/// An 8-bit integral value used to initialize bits [159:152] of the result.
3805/// \param __b18
3806/// An 8-bit integral value used to initialize bits [151:144] of the result.
3807/// \param __b17
3808/// An 8-bit integral value used to initialize bits [143:136] of the result.
3809/// \param __b16
3810/// An 8-bit integral value used to initialize bits [135:128] of the result.
3811/// \param __b15
3812/// An 8-bit integral value used to initialize bits [127:120] of the result.
3813/// \param __b14
3814/// An 8-bit integral value used to initialize bits [119:112] of the result.
3815/// \param __b13
3816/// An 8-bit integral value used to initialize bits [111:104] of the result.
3817/// \param __b12
3818/// An 8-bit integral value used to initialize bits [103:96] of the result.
3819/// \param __b11
3820/// An 8-bit integral value used to initialize bits [95:88] of the result.
3821/// \param __b10
3822/// An 8-bit integral value used to initialize bits [87:80] of the result.
3823/// \param __b09
3824/// An 8-bit integral value used to initialize bits [79:72] of the result.
3825/// \param __b08
3826/// An 8-bit integral value used to initialize bits [71:64] of the result.
3827/// \param __b07
3828/// An 8-bit integral value used to initialize bits [63:56] of the result.
3829/// \param __b06
3830/// An 8-bit integral value used to initialize bits [55:48] of the result.
3831/// \param __b05
3832/// An 8-bit integral value used to initialize bits [47:40] of the result.
3833/// \param __b04
3834/// An 8-bit integral value used to initialize bits [39:32] of the result.
3835/// \param __b03
3836/// An 8-bit integral value used to initialize bits [31:24] of the result.
3837/// \param __b02
3838/// An 8-bit integral value used to initialize bits [23:16] of the result.
3839/// \param __b01
3840/// An 8-bit integral value used to initialize bits [15:8] of the result.
3841/// \param __b00
3842/// An 8-bit integral value used to initialize bits [7:0] of the result.
3843/// \returns An initialized 256-bit integer vector.
3844static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3845_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3846 char __b27, char __b26, char __b25, char __b24,
3847 char __b23, char __b22, char __b21, char __b20,
3848 char __b19, char __b18, char __b17, char __b16,
3849 char __b15, char __b14, char __b13, char __b12,
3850 char __b11, char __b10, char __b09, char __b08,
3851 char __b07, char __b06, char __b05, char __b04,
3852 char __b03, char __b02, char __b01, char __b00)
3853{
3854 return __extension__ (__m256i)(__v32qi){
3855 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3856 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3857 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3858 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3859 };
3860}
3861
3862/// Constructs a 256-bit integer vector initialized with the specified
3863/// 64-bit integral values.
3864///
3865/// \headerfile <x86intrin.h>
3866///
3867/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3868/// instruction.
3869///
3870/// \param __a
3871/// A 64-bit integral value used to initialize bits [255:192] of the result.
3872/// \param __b
3873/// A 64-bit integral value used to initialize bits [191:128] of the result.
3874/// \param __c
3875/// A 64-bit integral value used to initialize bits [127:64] of the result.
3876/// \param __d
3877/// A 64-bit integral value used to initialize bits [63:0] of the result.
3878/// \returns An initialized 256-bit integer vector.
3879static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3880_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3881{
3882 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3883}
3884
3885/* Create vectors with elements in reverse order */
3886/// Constructs a 256-bit floating-point vector of [4 x double],
3887/// initialized in reverse order with the specified double-precision
3888/// floating-point values.
3889///
3890/// \headerfile <x86intrin.h>
3891///
3892/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3893/// instruction.
3894///
3895/// \param __a
3896/// A double-precision floating-point value used to initialize bits [63:0]
3897/// of the result.
3898/// \param __b
3899/// A double-precision floating-point value used to initialize bits [127:64]
3900/// of the result.
3901/// \param __c
3902/// A double-precision floating-point value used to initialize bits [191:128]
3903/// of the result.
3904/// \param __d
3905/// A double-precision floating-point value used to initialize bits [255:192]
3906/// of the result.
3907/// \returns An initialized 256-bit floating-point vector of [4 x double].
3908static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3909_mm256_setr_pd(double __a, double __b, double __c, double __d)
3910{
3911 return _mm256_set_pd(__d, __c, __b, __a);
3912}
3913
3914/// Constructs a 256-bit floating-point vector of [8 x float],
3915/// initialized in reverse order with the specified single-precision
3916/// float-point values.
3917///
3918/// \headerfile <x86intrin.h>
3919///
3920/// This intrinsic is a utility function and does not correspond to a specific
3921/// instruction.
3922///
3923/// \param __a
3924/// A single-precision floating-point value used to initialize bits [31:0]
3925/// of the result.
3926/// \param __b
3927/// A single-precision floating-point value used to initialize bits [63:32]
3928/// of the result.
3929/// \param __c
3930/// A single-precision floating-point value used to initialize bits [95:64]
3931/// of the result.
3932/// \param __d
3933/// A single-precision floating-point value used to initialize bits [127:96]
3934/// of the result.
3935/// \param __e
3936/// A single-precision floating-point value used to initialize bits [159:128]
3937/// of the result.
3938/// \param __f
3939/// A single-precision floating-point value used to initialize bits [191:160]
3940/// of the result.
3941/// \param __g
3942/// A single-precision floating-point value used to initialize bits [223:192]
3943/// of the result.
3944/// \param __h
3945/// A single-precision floating-point value used to initialize bits [255:224]
3946/// of the result.
3947/// \returns An initialized 256-bit floating-point vector of [8 x float].
3948static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3949_mm256_setr_ps(float __a, float __b, float __c, float __d,
3950 float __e, float __f, float __g, float __h)
3951{
3952 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3953}
3954
3955/// Constructs a 256-bit integer vector, initialized in reverse order
3956/// with the specified 32-bit integral values.
3957///
3958/// \headerfile <x86intrin.h>
3959///
3960/// This intrinsic is a utility function and does not correspond to a specific
3961/// instruction.
3962///
3963/// \param __i0
3964/// A 32-bit integral value used to initialize bits [31:0] of the result.
3965/// \param __i1
3966/// A 32-bit integral value used to initialize bits [63:32] of the result.
3967/// \param __i2
3968/// A 32-bit integral value used to initialize bits [95:64] of the result.
3969/// \param __i3
3970/// A 32-bit integral value used to initialize bits [127:96] of the result.
3971/// \param __i4
3972/// A 32-bit integral value used to initialize bits [159:128] of the result.
3973/// \param __i5
3974/// A 32-bit integral value used to initialize bits [191:160] of the result.
3975/// \param __i6
3976/// A 32-bit integral value used to initialize bits [223:192] of the result.
3977/// \param __i7
3978/// A 32-bit integral value used to initialize bits [255:224] of the result.
3979/// \returns An initialized 256-bit integer vector.
3980static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3981_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3982 int __i4, int __i5, int __i6, int __i7)
3983{
3984 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
3985}
3986
3987/// Constructs a 256-bit integer vector, initialized in reverse order
3988/// with the specified 16-bit integral values.
3989///
3990/// \headerfile <x86intrin.h>
3991///
3992/// This intrinsic is a utility function and does not correspond to a specific
3993/// instruction.
3994///
3995/// \param __w15
3996/// A 16-bit integral value used to initialize bits [15:0] of the result.
3997/// \param __w14
3998/// A 16-bit integral value used to initialize bits [31:16] of the result.
3999/// \param __w13
4000/// A 16-bit integral value used to initialize bits [47:32] of the result.
4001/// \param __w12
4002/// A 16-bit integral value used to initialize bits [63:48] of the result.
4003/// \param __w11
4004/// A 16-bit integral value used to initialize bits [79:64] of the result.
4005/// \param __w10
4006/// A 16-bit integral value used to initialize bits [95:80] of the result.
4007/// \param __w09
4008/// A 16-bit integral value used to initialize bits [111:96] of the result.
4009/// \param __w08
4010/// A 16-bit integral value used to initialize bits [127:112] of the result.
4011/// \param __w07
4012/// A 16-bit integral value used to initialize bits [143:128] of the result.
4013/// \param __w06
4014/// A 16-bit integral value used to initialize bits [159:144] of the result.
4015/// \param __w05
4016/// A 16-bit integral value used to initialize bits [175:160] of the result.
4017/// \param __w04
4018/// A 16-bit integral value used to initialize bits [191:176] of the result.
4019/// \param __w03
4020/// A 16-bit integral value used to initialize bits [207:192] of the result.
4021/// \param __w02
4022/// A 16-bit integral value used to initialize bits [223:208] of the result.
4023/// \param __w01
4024/// A 16-bit integral value used to initialize bits [239:224] of the result.
4025/// \param __w00
4026/// A 16-bit integral value used to initialize bits [255:240] of the result.
4027/// \returns An initialized 256-bit integer vector.
4028static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4029_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4030 short __w11, short __w10, short __w09, short __w08,
4031 short __w07, short __w06, short __w05, short __w04,
4032 short __w03, short __w02, short __w01, short __w00)
4033{
4034 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4035 __w04, __w05, __w06, __w07,
4036 __w08, __w09, __w10, __w11,
4037 __w12, __w13, __w14, __w15);
4038}
4039
4040/// Constructs a 256-bit integer vector, initialized in reverse order
4041/// with the specified 8-bit integral values.
4042///
4043/// \headerfile <x86intrin.h>
4044///
4045/// This intrinsic is a utility function and does not correspond to a specific
4046/// instruction.
4047///
4048/// \param __b31
4049/// An 8-bit integral value used to initialize bits [7:0] of the result.
4050/// \param __b30
4051/// An 8-bit integral value used to initialize bits [15:8] of the result.
4052/// \param __b29
4053/// An 8-bit integral value used to initialize bits [23:16] of the result.
4054/// \param __b28
4055/// An 8-bit integral value used to initialize bits [31:24] of the result.
4056/// \param __b27
4057/// An 8-bit integral value used to initialize bits [39:32] of the result.
4058/// \param __b26
4059/// An 8-bit integral value used to initialize bits [47:40] of the result.
4060/// \param __b25
4061/// An 8-bit integral value used to initialize bits [55:48] of the result.
4062/// \param __b24
4063/// An 8-bit integral value used to initialize bits [63:56] of the result.
4064/// \param __b23
4065/// An 8-bit integral value used to initialize bits [71:64] of the result.
4066/// \param __b22
4067/// An 8-bit integral value used to initialize bits [79:72] of the result.
4068/// \param __b21
4069/// An 8-bit integral value used to initialize bits [87:80] of the result.
4070/// \param __b20
4071/// An 8-bit integral value used to initialize bits [95:88] of the result.
4072/// \param __b19
4073/// An 8-bit integral value used to initialize bits [103:96] of the result.
4074/// \param __b18
4075/// An 8-bit integral value used to initialize bits [111:104] of the result.
4076/// \param __b17
4077/// An 8-bit integral value used to initialize bits [119:112] of the result.
4078/// \param __b16
4079/// An 8-bit integral value used to initialize bits [127:120] of the result.
4080/// \param __b15
4081/// An 8-bit integral value used to initialize bits [135:128] of the result.
4082/// \param __b14
4083/// An 8-bit integral value used to initialize bits [143:136] of the result.
4084/// \param __b13
4085/// An 8-bit integral value used to initialize bits [151:144] of the result.
4086/// \param __b12
4087/// An 8-bit integral value used to initialize bits [159:152] of the result.
4088/// \param __b11
4089/// An 8-bit integral value used to initialize bits [167:160] of the result.
4090/// \param __b10
4091/// An 8-bit integral value used to initialize bits [175:168] of the result.
4092/// \param __b09
4093/// An 8-bit integral value used to initialize bits [183:176] of the result.
4094/// \param __b08
4095/// An 8-bit integral value used to initialize bits [191:184] of the result.
4096/// \param __b07
4097/// An 8-bit integral value used to initialize bits [199:192] of the result.
4098/// \param __b06
4099/// An 8-bit integral value used to initialize bits [207:200] of the result.
4100/// \param __b05
4101/// An 8-bit integral value used to initialize bits [215:208] of the result.
4102/// \param __b04
4103/// An 8-bit integral value used to initialize bits [223:216] of the result.
4104/// \param __b03
4105/// An 8-bit integral value used to initialize bits [231:224] of the result.
4106/// \param __b02
4107/// An 8-bit integral value used to initialize bits [239:232] of the result.
4108/// \param __b01
4109/// An 8-bit integral value used to initialize bits [247:240] of the result.
4110/// \param __b00
4111/// An 8-bit integral value used to initialize bits [255:248] of the result.
4112/// \returns An initialized 256-bit integer vector.
4113static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4114_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4115 char __b27, char __b26, char __b25, char __b24,
4116 char __b23, char __b22, char __b21, char __b20,
4117 char __b19, char __b18, char __b17, char __b16,
4118 char __b15, char __b14, char __b13, char __b12,
4119 char __b11, char __b10, char __b09, char __b08,
4120 char __b07, char __b06, char __b05, char __b04,
4121 char __b03, char __b02, char __b01, char __b00)
4122{
4123 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4124 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4125 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4126 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4127}
4128
4129/// Constructs a 256-bit integer vector, initialized in reverse order
4130/// with the specified 64-bit integral values.
4131///
4132/// \headerfile <x86intrin.h>
4133///
4134/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4135/// instruction.
4136///
4137/// \param __a
4138/// A 64-bit integral value used to initialize bits [63:0] of the result.
4139/// \param __b
4140/// A 64-bit integral value used to initialize bits [127:64] of the result.
4141/// \param __c
4142/// A 64-bit integral value used to initialize bits [191:128] of the result.
4143/// \param __d
4144/// A 64-bit integral value used to initialize bits [255:192] of the result.
4145/// \returns An initialized 256-bit integer vector.
4146static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4147_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4148{
4149 return _mm256_set_epi64x(__d, __c, __b, __a);
4150}
4151
4152/* Create vectors with repeated elements */
4153/// Constructs a 256-bit floating-point vector of [4 x double], with each
4154/// of the four double-precision floating-point vector elements set to the
4155/// specified double-precision floating-point value.
4156///
4157/// \headerfile <x86intrin.h>
4158///
4159/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4160///
4161/// \param __w
4162/// A double-precision floating-point value used to initialize each vector
4163/// element of the result.
4164/// \returns An initialized 256-bit floating-point vector of [4 x double].
4165static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4167{
4168 return _mm256_set_pd(__w, __w, __w, __w);
4169}
4170
4171/// Constructs a 256-bit floating-point vector of [8 x float], with each
4172/// of the eight single-precision floating-point vector elements set to the
4173/// specified single-precision floating-point value.
4174///
4175/// \headerfile <x86intrin.h>
4176///
4177/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4178/// instruction.
4179///
4180/// \param __w
4181/// A single-precision floating-point value used to initialize each vector
4182/// element of the result.
4183/// \returns An initialized 256-bit floating-point vector of [8 x float].
4184static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4186{
4187 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4188}
4189
4190/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4191/// 32-bit integral vector elements set to the specified 32-bit integral
4192/// value.
4193///
4194/// \headerfile <x86intrin.h>
4195///
4196/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4197/// instruction.
4198///
4199/// \param __i
4200/// A 32-bit integral value used to initialize each vector element of the
4201/// result.
4202/// \returns An initialized 256-bit integer vector of [8 x i32].
4203static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4205{
4206 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4207}
4208
4209/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4210/// 16-bit integral vector elements set to the specified 16-bit integral
4211/// value.
4212///
4213/// \headerfile <x86intrin.h>
4214///
4215/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4216///
4217/// \param __w
4218/// A 16-bit integral value used to initialize each vector element of the
4219/// result.
4220/// \returns An initialized 256-bit integer vector of [16 x i16].
4221static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4223{
4224 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4225 __w, __w, __w, __w, __w, __w, __w, __w);
4226}
4227
4228/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4229/// 8-bit integral vector elements set to the specified 8-bit integral value.
4230///
4231/// \headerfile <x86intrin.h>
4232///
4233/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4234///
4235/// \param __b
4236/// An 8-bit integral value used to initialize each vector element of the
4237/// result.
4238/// \returns An initialized 256-bit integer vector of [32 x i8].
4239static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4241{
4242 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4243 __b, __b, __b, __b, __b, __b, __b, __b,
4244 __b, __b, __b, __b, __b, __b, __b, __b,
4245 __b, __b, __b, __b, __b, __b, __b, __b);
4246}
4247
4248/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4249/// 64-bit integral vector elements set to the specified 64-bit integral
4250/// value.
4251///
4252/// \headerfile <x86intrin.h>
4253///
4254/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4255///
4256/// \param __q
4257/// A 64-bit integral value used to initialize each vector element of the
4258/// result.
4259/// \returns An initialized 256-bit integer vector of [4 x i64].
4260static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4262{
4263 return _mm256_set_epi64x(__q, __q, __q, __q);
4264}
4265
4266/* Create __zeroed vectors */
4267/// Constructs a 256-bit floating-point vector of [4 x double] with all
4268/// vector elements initialized to zero.
4269///
4270/// \headerfile <x86intrin.h>
4271///
4272/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4273///
4274/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4276 return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
4277}
4278
4279/// Constructs a 256-bit floating-point vector of [8 x float] with all
4280/// vector elements initialized to zero.
4281///
4282/// \headerfile <x86intrin.h>
4283///
4284/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4285///
4286/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4288 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4289}
4290
4291/// Constructs a 256-bit integer vector initialized to zero.
4292///
4293/// \headerfile <x86intrin.h>
4294///
4295/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4296///
4297/// \returns A 256-bit integer vector initialized to zero.
4298static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4300 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4301}
4302
4303/* Cast between vector types */
4304/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4305/// floating-point vector of [8 x float].
4306///
4307/// \headerfile <x86intrin.h>
4308///
4309/// This intrinsic has no corresponding instruction.
4310///
4311/// \param __a
4312/// A 256-bit floating-point vector of [4 x double].
4313/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4314/// bitwise pattern as the parameter.
4315static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4317{
4318 return (__m256)__a;
4319}
4320
4321/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4322/// integer vector.
4323///
4324/// \headerfile <x86intrin.h>
4325///
4326/// This intrinsic has no corresponding instruction.
4327///
4328/// \param __a
4329/// A 256-bit floating-point vector of [4 x double].
4330/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4331/// parameter.
4332static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4334{
4335 return (__m256i)__a;
4336}
4337
4338/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4339/// floating-point vector of [4 x double].
4340///
4341/// \headerfile <x86intrin.h>
4342///
4343/// This intrinsic has no corresponding instruction.
4344///
4345/// \param __a
4346/// A 256-bit floating-point vector of [8 x float].
4347/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4348/// bitwise pattern as the parameter.
4349static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4351{
4352 return (__m256d)__a;
4353}
4354
4355/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4356/// integer vector.
4357///
4358/// \headerfile <x86intrin.h>
4359///
4360/// This intrinsic has no corresponding instruction.
4361///
4362/// \param __a
4363/// A 256-bit floating-point vector of [8 x float].
4364/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4365/// parameter.
4366static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4368{
4369 return (__m256i)__a;
4370}
4371
4372/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4373/// of [8 x float].
4374///
4375/// \headerfile <x86intrin.h>
4376///
4377/// This intrinsic has no corresponding instruction.
4378///
4379/// \param __a
4380/// A 256-bit integer vector.
4381/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4382/// bitwise pattern as the parameter.
4383static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4385{
4386 return (__m256)__a;
4387}
4388
4389/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4390/// of [4 x double].
4391///
4392/// \headerfile <x86intrin.h>
4393///
4394/// This intrinsic has no corresponding instruction.
4395///
4396/// \param __a
4397/// A 256-bit integer vector.
4398/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4399/// bitwise pattern as the parameter.
4400static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4402{
4403 return (__m256d)__a;
4404}
4405
4406/// Returns the lower 128 bits of a 256-bit floating-point vector of
4407/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4408///
4409/// \headerfile <x86intrin.h>
4410///
4411/// This intrinsic has no corresponding instruction.
4412///
4413/// \param __a
4414/// A 256-bit floating-point vector of [4 x double].
4415/// \returns A 128-bit floating-point vector of [2 x double] containing the
4416/// lower 128 bits of the parameter.
4417static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4419{
4420 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4421}
4422
4423/// Returns the lower 128 bits of a 256-bit floating-point vector of
4424/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4425///
4426/// \headerfile <x86intrin.h>
4427///
4428/// This intrinsic has no corresponding instruction.
4429///
4430/// \param __a
4431/// A 256-bit floating-point vector of [8 x float].
4432/// \returns A 128-bit floating-point vector of [4 x float] containing the
4433/// lower 128 bits of the parameter.
4434static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4436{
4437 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4438}
4439
4440/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4441///
4442/// \headerfile <x86intrin.h>
4443///
4444/// This intrinsic has no corresponding instruction.
4445///
4446/// \param __a
4447/// A 256-bit integer vector.
4448/// \returns A 128-bit integer vector containing the lower 128 bits of the
4449/// parameter.
4450static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4452{
4453 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4454}
4455
4456/// Constructs a 256-bit floating-point vector of [4 x double] from a
4457/// 128-bit floating-point vector of [2 x double].
4458///
4459/// The lower 128 bits contain the value of the source vector. The contents
4460/// of the upper 128 bits are undefined.
4461///
4462/// \headerfile <x86intrin.h>
4463///
4464/// This intrinsic has no corresponding instruction.
4465///
4466/// \param __a
4467/// A 128-bit vector of [2 x double].
4468/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4469/// contain the value of the parameter. The contents of the upper 128 bits
4470/// are undefined.
4471static __inline __m256d __DEFAULT_FN_ATTRS
4473{
4474 return __builtin_shufflevector(
4475 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4476}
4477
4478/// Constructs a 256-bit floating-point vector of [8 x float] from a
4479/// 128-bit floating-point vector of [4 x float].
4480///
4481/// The lower 128 bits contain the value of the source vector. The contents
4482/// of the upper 128 bits are undefined.
4483///
4484/// \headerfile <x86intrin.h>
4485///
4486/// This intrinsic has no corresponding instruction.
4487///
4488/// \param __a
4489/// A 128-bit vector of [4 x float].
4490/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4491/// contain the value of the parameter. The contents of the upper 128 bits
4492/// are undefined.
4493static __inline __m256 __DEFAULT_FN_ATTRS
4495{
4496 return __builtin_shufflevector((__v4sf)__a,
4497 (__v4sf)__builtin_nondeterministic_value(__a),
4498 0, 1, 2, 3, 4, 5, 6, 7);
4499}
4500
4501/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4502///
4503/// The lower 128 bits contain the value of the source vector. The contents
4504/// of the upper 128 bits are undefined.
4505///
4506/// \headerfile <x86intrin.h>
4507///
4508/// This intrinsic has no corresponding instruction.
4509///
4510/// \param __a
4511/// A 128-bit integer vector.
4512/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4513/// the parameter. The contents of the upper 128 bits are undefined.
4514static __inline __m256i __DEFAULT_FN_ATTRS
4516{
4517 return __builtin_shufflevector(
4518 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4519}
4520
4521/// Constructs a 256-bit floating-point vector of [4 x double] from a
4522/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4523/// contain the value of the source vector. The upper 128 bits are set
4524/// to zero.
4525///
4526/// \headerfile <x86intrin.h>
4527///
4528/// This intrinsic has no corresponding instruction.
4529///
4530/// \param __a
4531/// A 128-bit vector of [2 x double].
4532/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4533/// contain the value of the parameter. The upper 128 bits are set to zero.
4534static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4536 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4537}
4538
4539/// Constructs a 256-bit floating-point vector of [8 x float] from a
4540/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4541/// the value of the source vector. The upper 128 bits are set to zero.
4542///
4543/// \headerfile <x86intrin.h>
4544///
4545/// This intrinsic has no corresponding instruction.
4546///
4547/// \param __a
4548/// A 128-bit vector of [4 x float].
4549/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4550/// contain the value of the parameter. The upper 128 bits are set to zero.
4551static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4553 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4554}
4555
4556/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4557/// The lower 128 bits contain the value of the source vector. The upper
4558/// 128 bits are set to zero.
4559///
4560/// \headerfile <x86intrin.h>
4561///
4562/// This intrinsic has no corresponding instruction.
4563///
4564/// \param __a
4565/// A 128-bit integer vector.
4566/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4567/// the parameter. The upper 128 bits are set to zero.
4568static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4570 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4571}
4572
/*
   Vector insert.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// Constructs a new 256-bit vector of [8 x float] by first duplicating
///    a 256-bit vector of [8 x float] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [4 x float] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4615
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
4653
/// Builds a 256-bit integer vector from a copy of the 256-bit integer
///    vector in the first parameter in which one 128-bit half has been
///    overwritten with the 128-bit integer vector in the second parameter.
///
///    The immediate integer parameter chooses whether the lower or the upper
///    128 bits are overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It is copied to the result first; afterwards
///    either the lower or the upper 128 bits of the result are replaced by
///    the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. Its contents are written to either the lower
///    or the upper 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. Its least significant bit selects the half of the
///    result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M)                                     \
  ((__m256i)__builtin_ia32_vinsertf128_si256(                                  \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
4691
4692/*
4693 Vector extract.
4694 We use macros rather than inlines because we only want to accept
4695 invocations where the immediate M is a constant expression.
4696*/
/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a 128-bit
///    vector of [4 x float]. The immediate integer parameter selects whether
///    the lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float] to extract from.
/// \param M
///    An immediate integer. Its least significant bit selects the bits that
///    are extracted from \a V: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M)                                            \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V),              \
                                             (int)(M)))
4720
/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a 128-bit
///    vector of [2 x double]. The immediate integer parameter selects whether
///    the lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] to extract from.
/// \param M
///    An immediate integer. Its least significant bit selects the bits that
///    are extracted from \a V: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M)                                            \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V),            \
                                              (int)(M)))
4744
/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit integer
///    vector. The immediate integer parameter selects whether the lower or
///    the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector to extract from.
/// \param M
///    An immediate integer. Its least significant bit selects the bits that
///    are extracted from \a V: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M)                                         \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V),            \
                                              (int)(M)))
4768
4769/// Constructs a 256-bit floating-point vector of [8 x float] by
4770/// concatenating two 128-bit floating-point vectors of [4 x float].
4771///
4772/// \headerfile <x86intrin.h>
4773///
4774/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4775///
4776/// \param __hi
4777/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4778/// 128 bits of the result.
4779/// \param __lo
4780/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4781/// 128 bits of the result.
4782/// \returns A 256-bit floating-point vector of [8 x float] containing the
4783/// concatenated result.
4784static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4785_mm256_set_m128(__m128 __hi, __m128 __lo) {
4786 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4787}
4788
4789/// Constructs a 256-bit floating-point vector of [4 x double] by
4790/// concatenating two 128-bit floating-point vectors of [2 x double].
4791///
4792/// \headerfile <x86intrin.h>
4793///
4794/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4795///
4796/// \param __hi
4797/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4798/// 128 bits of the result.
4799/// \param __lo
4800/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4801/// 128 bits of the result.
4802/// \returns A 256-bit floating-point vector of [4 x double] containing the
4803/// concatenated result.
4804static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4805_mm256_set_m128d(__m128d __hi, __m128d __lo) {
4806 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4807}
4808
4809/// Constructs a 256-bit integer vector by concatenating two 128-bit
4810/// integer vectors.
4811///
4812/// \headerfile <x86intrin.h>
4813///
4814/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4815///
4816/// \param __hi
4817/// A 128-bit integer vector to be copied to the upper 128 bits of the
4818/// result.
4819/// \param __lo
4820/// A 128-bit integer vector to be copied to the lower 128 bits of the
4821/// result.
4822/// \returns A 256-bit integer vector containing the concatenated result.
4823static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4824_mm256_set_m128i(__m128i __hi, __m128i __lo) {
4825 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4826}
4827
4828/// Constructs a 256-bit floating-point vector of [8 x float] by
4829/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4830/// similar to _mm256_set_m128, but the order of the input parameters is
4831/// swapped.
4832///
4833/// \headerfile <x86intrin.h>
4834///
4835/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4836///
4837/// \param __lo
4838/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4839/// 128 bits of the result.
4840/// \param __hi
4841/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4842/// 128 bits of the result.
4843/// \returns A 256-bit floating-point vector of [8 x float] containing the
4844/// concatenated result.
4845static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4846_mm256_setr_m128(__m128 __lo, __m128 __hi) {
4847 return _mm256_set_m128(__hi, __lo);
4848}
4849
4850/// Constructs a 256-bit floating-point vector of [4 x double] by
4851/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4852/// similar to _mm256_set_m128d, but the order of the input parameters is
4853/// swapped.
4854///
4855/// \headerfile <x86intrin.h>
4856///
4857/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4858///
4859/// \param __lo
4860/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4861/// 128 bits of the result.
4862/// \param __hi
4863/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4864/// 128 bits of the result.
4865/// \returns A 256-bit floating-point vector of [4 x double] containing the
4866/// concatenated result.
4867static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4868_mm256_setr_m128d(__m128d __lo, __m128d __hi) {
4869 return (__m256d)_mm256_set_m128d(__hi, __lo);
4870}
4871
4872/// Constructs a 256-bit integer vector by concatenating two 128-bit
4873/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4874/// the input parameters is swapped.
4875///
4876/// \headerfile <x86intrin.h>
4877///
4878/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4879///
4880/// \param __lo
4881/// A 128-bit integer vector to be copied to the lower 128 bits of the
4882/// result.
4883/// \param __hi
4884/// A 128-bit integer vector to be copied to the upper 128 bits of the
4885/// result.
4886/// \returns A 256-bit integer vector containing the concatenated result.
4887static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4888_mm256_setr_m128i(__m128i __lo, __m128i __hi) {
4889 return (__m256i)_mm256_set_m128i(__hi, __lo);
4890}
4891
4892/* SIMD load ops (unaligned) */
4893/// Loads two 128-bit floating-point vectors of [4 x float] from
4894/// unaligned memory locations and constructs a 256-bit floating-point vector
4895/// of [8 x float] by concatenating the two 128-bit vectors.
4896///
4897/// \headerfile <x86intrin.h>
4898///
4899/// This intrinsic corresponds to load instructions followed by the
4900/// <c> VINSERTF128 </c> instruction.
4901///
4902/// \param __addr_hi
4903/// A pointer to a 128-bit memory location containing 4 consecutive
4904/// single-precision floating-point values. These values are to be copied to
4905/// bits[255:128] of the result. The address of the memory location does not
4906/// have to be aligned.
4907/// \param __addr_lo
4908/// A pointer to a 128-bit memory location containing 4 consecutive
4909/// single-precision floating-point values. These values are to be copied to
4910/// bits[127:0] of the result. The address of the memory location does not
4911/// have to be aligned.
4912/// \returns A 256-bit floating-point vector of [8 x float] containing the
4913/// concatenated result.
4914static __inline __m256 __DEFAULT_FN_ATTRS
4915_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4916{
4917 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4918}
4919
4920/// Loads two 128-bit floating-point vectors of [2 x double] from
4921/// unaligned memory locations and constructs a 256-bit floating-point vector
4922/// of [4 x double] by concatenating the two 128-bit vectors.
4923///
4924/// \headerfile <x86intrin.h>
4925///
4926/// This intrinsic corresponds to load instructions followed by the
4927/// <c> VINSERTF128 </c> instruction.
4928///
4929/// \param __addr_hi
4930/// A pointer to a 128-bit memory location containing two consecutive
4931/// double-precision floating-point values. These values are to be copied to
4932/// bits[255:128] of the result. The address of the memory location does not
4933/// have to be aligned.
4934/// \param __addr_lo
4935/// A pointer to a 128-bit memory location containing two consecutive
4936/// double-precision floating-point values. These values are to be copied to
4937/// bits[127:0] of the result. The address of the memory location does not
4938/// have to be aligned.
4939/// \returns A 256-bit floating-point vector of [4 x double] containing the
4940/// concatenated result.
4941static __inline __m256d __DEFAULT_FN_ATTRS
4942_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4943{
4944 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4945}
4946
4947/// Loads two 128-bit integer vectors from unaligned memory locations and
4948/// constructs a 256-bit integer vector by concatenating the two 128-bit
4949/// vectors.
4950///
4951/// \headerfile <x86intrin.h>
4952///
4953/// This intrinsic corresponds to load instructions followed by the
4954/// <c> VINSERTF128 </c> instruction.
4955///
4956/// \param __addr_hi
4957/// A pointer to a 128-bit memory location containing a 128-bit integer
4958/// vector. This vector is to be copied to bits[255:128] of the result. The
4959/// address of the memory location does not have to be aligned.
4960/// \param __addr_lo
4961/// A pointer to a 128-bit memory location containing a 128-bit integer
4962/// vector. This vector is to be copied to bits[127:0] of the result. The
4963/// address of the memory location does not have to be aligned.
4964/// \returns A 256-bit integer vector containing the concatenated result.
4965static __inline __m256i __DEFAULT_FN_ATTRS
4966_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
4967{
4968 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
4969}
4970
4971/* SIMD store ops (unaligned) */
4972/// Stores the upper and lower 128 bits of a 256-bit floating-point
4973/// vector of [8 x float] into two different unaligned memory locations.
4974///
4975/// \headerfile <x86intrin.h>
4976///
4977/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4978/// store instructions.
4979///
4980/// \param __addr_hi
4981/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4982/// copied to this memory location. The address of this memory location does
4983/// not have to be aligned.
4984/// \param __addr_lo
4985/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4986/// copied to this memory location. The address of this memory location does
4987/// not have to be aligned.
4988/// \param __a
4989/// A 256-bit floating-point vector of [8 x float].
4990static __inline void __DEFAULT_FN_ATTRS
4991_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4992{
4993 __m128 __v128;
4994
4995 __v128 = _mm256_castps256_ps128(__a);
4996 _mm_storeu_ps(__addr_lo, __v128);
4997 __v128 = _mm256_extractf128_ps(__a, 1);
4998 _mm_storeu_ps(__addr_hi, __v128);
4999}
5000
5001/// Stores the upper and lower 128 bits of a 256-bit floating-point
5002/// vector of [4 x double] into two different unaligned memory locations.
5003///
5004/// \headerfile <x86intrin.h>
5005///
5006/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5007/// store instructions.
5008///
5009/// \param __addr_hi
5010/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5011/// copied to this memory location. The address of this memory location does
5012/// not have to be aligned.
5013/// \param __addr_lo
5014/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5015/// copied to this memory location. The address of this memory location does
5016/// not have to be aligned.
5017/// \param __a
5018/// A 256-bit floating-point vector of [4 x double].
5019static __inline void __DEFAULT_FN_ATTRS
5020_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5021{
5022 __m128d __v128;
5023
5024 __v128 = _mm256_castpd256_pd128(__a);
5025 _mm_storeu_pd(__addr_lo, __v128);
5026 __v128 = _mm256_extractf128_pd(__a, 1);
5027 _mm_storeu_pd(__addr_hi, __v128);
5028}
5029
5030/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5031/// two different unaligned memory locations.
5032///
5033/// \headerfile <x86intrin.h>
5034///
5035/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5036/// store instructions.
5037///
5038/// \param __addr_hi
5039/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5040/// copied to this memory location. The address of this memory location does
5041/// not have to be aligned.
5042/// \param __addr_lo
5043/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5044/// copied to this memory location. The address of this memory location does
5045/// not have to be aligned.
5046/// \param __a
5047/// A 256-bit integer vector.
5048static __inline void __DEFAULT_FN_ATTRS
5049_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5050{
5051 __m128i __v128;
5052
5053 __v128 = _mm256_castsi256_si128(__a);
5054 _mm_storeu_si128(__addr_lo, __v128);
5055 __v128 = _mm256_extractf128_si256(__a, 1);
5056 _mm_storeu_si128(__addr_hi, __v128);
5057}
5058
5059#undef __DEFAULT_FN_ATTRS
5060#undef __DEFAULT_FN_ATTRS_CONSTEXPR
5061#undef __DEFAULT_FN_ATTRS128
5062#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5063
5064#endif /* __AVXINTRIN_H */
__device__ _Float16
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3010
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x floa...
Definition avxintrin.h:169
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition avxintrin.h:3054
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition avxintrin.h:3260
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition avxintrin.h:821
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition avxintrin.h:3556
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition avxintrin.h:2930
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition avxintrin.h:3074
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4535
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition avxintrin.h:2260
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition avxintrin.h:3224
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition avxintrin.h:3280
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition avxintrin.h:2947
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition avxintrin.h:4915
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:347
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3370
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition avxintrin.h:571
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2581
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition avxintrin.h:3714
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4552
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2665
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition avxintrin.h:116
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:380
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition avxintrin.h:2186
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition avxintrin.h:3604
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition avxintrin.h:300
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition avxintrin.h:965
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition avxintrin.h:263
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition avxintrin.h:3949
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4846
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3395
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3346
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition avxintrin.h:4333
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition avxintrin.h:242
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition avxintrin.h:186
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4888
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition avxintrin.h:3318
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition avxintrin.h:4718
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition avxintrin.h:4766
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition avxintrin.h:3167
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition avxintrin.h:4384
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition avxintrin.h:4316
static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition avxintrin.h:2327
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition avxintrin.h:3643
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition avxintrin.h:2352
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4868
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition avxintrin.h:132
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition avxintrin.h:1388
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition avxintrin.h:3536
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition avxintrin.h:3592
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:753
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:363
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition avxintrin.h:3762
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition avxintrin.h:2171
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition avxintrin.h:592
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition avxintrin.h:3468
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4472
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition avxintrin.h:4166
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2495
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition avxintrin.h:2240
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition avxintrin.h:3616
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition avxintrin.h:2220
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition avxintrin.h:2204
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition avxintrin.h:4185
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2469
static __inline __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition avxintrin.h:783
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [8 x float].
Definition avxintrin.h:3110
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition avxintrin.h:2157
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2861
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2608
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition avxintrin.h:2311
#define _mm256_extractf128_pd(V, M)
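Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double], as determined by the immediate integer parameter M, and returns the extracted bits as a 128-bit vector of [2 x double].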
Definition avxintrin.h:4742
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers returned in a vector of [8 x i32].
Definition avxintrin.h:2280
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 x float].
Definition avxintrin.h:4494
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and broadcasts it to the elements of a [8 x float] vector.
Definition avxintrin.h:3032
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2693
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zero.
Definition avxintrin.h:4287
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition avxintrin.h:4204
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition avxintrin.h:3845
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2636
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition avxintrin.h:4942
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition avxintrin.h:82
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2912
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:664
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two different unaligned memory locations.
Definition avxintrin.h:5020
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition avxintrin.h:332
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2751
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2778
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:688
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral values.
Definition avxintrin.h:4147
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements set to the specified 64-bit integral value.
Definition avxintrin.h:4261
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to by __p, as selected by the mask __m.
Definition avxintrin.h:3492
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory location pointed to by __p, as selected by the mask __m.
Definition avxintrin.h:3444
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral values.
Definition avxintrin.h:4114
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4401
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p into a vector of [4 x double].
Definition avxintrin.h:3127
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition avxintrin.h:223
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-precision floating-point values.
Definition avxintrin.h:3682
static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-point vector of [2 x double].
Definition avxintrin.h:4418
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4569
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition avxintrin.h:98
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral values.
Definition avxintrin.h:4029
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 256-bit vector of [8 x float].
Definition avxintrin.h:2377
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition avxintrin.h:2399
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory locations.
Definition avxintrin.h:5049
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-point vector of [4 x float].
Definition avxintrin.h:4435
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:610
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition avxintrin.h:4367
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2806
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:709
static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition avxintrin.h:875
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:646
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:628
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x double].
Definition avxintrin.h:151
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:732
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition avxintrin.h:1415
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition avxintrin.h:4966
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2523
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2835
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition avxintrin.h:4275
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition avxintrin.h:3880
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3419
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition avxintrin.h:316
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition avxintrin.h:3577
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition avxintrin.h:550
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3183
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to by __p.
Definition avxintrin.h:3301
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point vectors of [4 x float].
Definition avxintrin.h:4785
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition avxintrin.h:282
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4350
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4299
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4515
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition avxintrin.h:202
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p into a vector of [8 x float].
Definition avxintrin.h:3147
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and broadcasts it to the elements of a [4 x float] vector.
Definition avxintrin.h:2988
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition avxintrin.h:3981
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2551
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition avxintrin.h:2443
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition avxintrin.h:2422
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition avxintrin.h:4222
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set to the specified 8-bit integral value.
Definition avxintrin.h:4240
static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition avxintrin.h:4451
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2721
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3204
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2886
static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition avxintrin.h:2296
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4824
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition avxintrin.h:3242
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the specified double-precision floating-point values.
Definition avxintrin.h:3909
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two different unaligned memory locations.
Definition avxintrin.h:4991
typedef double __v4df __attribute__((__vector_size__(32)))
Definition avxintrin.h:17
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory location pointed to by __p, as selected by the mask __m.
Definition avxintrin.h:3516
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-point vectors of [2 x double].
Definition avxintrin.h:4805
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [4 x double].
Definition avxintrin.h:3094
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition avxintrin.h:532
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition emmintrin.h:1619
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878
static __inline__ void _mm_stream_si32(void *__p, int __a)
Definition emmintrin.h:4077
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1867
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit integer vector.
Definition emmintrin.h:3456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition emmintrin.h:1980
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition emmintrin.h:3909
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2091
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2012
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1854