clang 22.0.0git
avxintrin.h
Go to the documentation of this file.
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
/* Reject direct inclusion: the #error below fires unless __IMMINTRIN_H is
 * already defined (presumably by <immintrin.h>, as the message suggests --
 * TODO confirm against that header). */
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* 256-bit (32-byte) vectors typed by element; lanes use the signed or
 * default-signedness arithmetic types. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned lanes. */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* An explicitly signed byte vector is needed in addition to the plain char
 * one. It is not meant to show up in the interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 256-bit vector types aligned to 32 bytes. */
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i
    __attribute__((__vector_size__(32), __aligned__(32)));

/* The same vector types with 1-byte alignment. */
typedef float __m256_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef double __m256d_u
    __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u
    __attribute__((__vector_size__(32), __aligned__(1)));
41
42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Attribute sets applied to the functions defined in this file. Both force
 * inlining, suppress debug info and target "avx"; they differ only in the
 * minimum vector width they declare (256 vs. 128 bits). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(128)))

/* The *_CONSTEXPR variants append `constexpr` only when compiling as C++11 or
 * later; otherwise they are plain aliases of the attribute sets above. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#endif
67
68/* Arithmetic */
69/// Adds two 256-bit vectors of [4 x double].
70///
71/// \headerfile <x86intrin.h>
72///
73/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
74///
75/// \param __a
76/// A 256-bit vector of [4 x double] containing one of the source operands.
77/// \param __b
78/// A 256-bit vector of [4 x double] containing one of the source operands.
79/// \returns A 256-bit vector of [4 x double] containing the sums of both
80/// operands.
81static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
82_mm256_add_pd(__m256d __a, __m256d __b) {
83 return (__m256d)((__v4df)__a+(__v4df)__b);
84}
85
86/// Adds two 256-bit vectors of [8 x float].
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
91///
92/// \param __a
93/// A 256-bit vector of [8 x float] containing one of the source operands.
94/// \param __b
95/// A 256-bit vector of [8 x float] containing one of the source operands.
96/// \returns A 256-bit vector of [8 x float] containing the sums of both
97/// operands.
98static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a,
99 __m256 __b) {
100 return (__m256)((__v8sf)__a+(__v8sf)__b);
101}
102
103/// Subtracts two 256-bit vectors of [4 x double].
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
108///
109/// \param __a
110/// A 256-bit vector of [4 x double] containing the minuend.
111/// \param __b
112/// A 256-bit vector of [4 x double] containing the subtrahend.
113/// \returns A 256-bit vector of [4 x double] containing the differences between
114/// both operands.
115static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
116_mm256_sub_pd(__m256d __a, __m256d __b) {
117 return (__m256d)((__v4df)__a-(__v4df)__b);
118}
119
120/// Subtracts two 256-bit vectors of [8 x float].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
125///
126/// \param __a
127/// A 256-bit vector of [8 x float] containing the minuend.
128/// \param __b
129/// A 256-bit vector of [8 x float] containing the subtrahend.
130/// \returns A 256-bit vector of [8 x float] containing the differences between
131/// both operands.
132static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a,
133 __m256 __b) {
134 return (__m256)((__v8sf)__a-(__v8sf)__b);
135}
136
137/// Adds the even-indexed values and subtracts the odd-indexed values of
138/// two 256-bit vectors of [4 x double].
139///
140/// \headerfile <x86intrin.h>
141///
142/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
143///
144/// \param __a
145/// A 256-bit vector of [4 x double] containing the left source operand.
146/// \param __b
147/// A 256-bit vector of [4 x double] containing the right source operand.
148/// \returns A 256-bit vector of [4 x double] containing the alternating sums
149/// and differences between both operands.
150static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
151_mm256_addsub_pd(__m256d __a, __m256d __b) {
152 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
153}
154
155/// Adds the even-indexed values and subtracts the odd-indexed values of
156/// two 256-bit vectors of [8 x float].
157///
158/// \headerfile <x86intrin.h>
159///
160/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
161///
162/// \param __a
163/// A 256-bit vector of [8 x float] containing the left source operand.
164/// \param __b
165/// A 256-bit vector of [8 x float] containing the right source operand.
166/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
167/// differences between both operands.
168static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
169_mm256_addsub_ps(__m256 __a, __m256 __b) {
170 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
171}
172
173/// Divides two 256-bit vectors of [4 x double].
174///
175/// \headerfile <x86intrin.h>
176///
177/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
178///
179/// \param __a
180/// A 256-bit vector of [4 x double] containing the dividend.
181/// \param __b
182/// A 256-bit vector of [4 x double] containing the divisor.
183/// \returns A 256-bit vector of [4 x double] containing the quotients of both
184/// operands.
185static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
186_mm256_div_pd(__m256d __a, __m256d __b) {
187 return (__m256d)((__v4df)__a/(__v4df)__b);
188}
189
190/// Divides two 256-bit vectors of [8 x float].
191///
192/// \headerfile <x86intrin.h>
193///
194/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
195///
196/// \param __a
197/// A 256-bit vector of [8 x float] containing the dividend.
198/// \param __b
199/// A 256-bit vector of [8 x float] containing the divisor.
200/// \returns A 256-bit vector of [8 x float] containing the quotients of both
201/// operands.
202static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a,
203 __m256 __b) {
204 return (__m256)((__v8sf)__a/(__v8sf)__b);
205}
206
207/// Compares two 256-bit vectors of [4 x double] and returns the greater
208/// of each pair of values.
209///
210/// If either value in a comparison is NaN, returns the value from \a __b.
211///
212/// \headerfile <x86intrin.h>
213///
214/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
215///
216/// \param __a
217/// A 256-bit vector of [4 x double] containing one of the operands.
218/// \param __b
219/// A 256-bit vector of [4 x double] containing one of the operands.
220/// \returns A 256-bit vector of [4 x double] containing the maximum values
221/// between both operands.
222static __inline __m256d __DEFAULT_FN_ATTRS
223_mm256_max_pd(__m256d __a, __m256d __b)
224{
225 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
226}
227
228/// Compares two 256-bit vectors of [8 x float] and returns the greater
229/// of each pair of values.
230///
231/// If either value in a comparison is NaN, returns the value from \a __b.
232///
233/// \headerfile <x86intrin.h>
234///
235/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
236///
237/// \param __a
238/// A 256-bit vector of [8 x float] containing one of the operands.
239/// \param __b
240/// A 256-bit vector of [8 x float] containing one of the operands.
241/// \returns A 256-bit vector of [8 x float] containing the maximum values
242/// between both operands.
243static __inline __m256 __DEFAULT_FN_ATTRS
244_mm256_max_ps(__m256 __a, __m256 __b)
245{
246 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
247}
248
249/// Compares two 256-bit vectors of [4 x double] and returns the lesser
250/// of each pair of values.
251///
252/// If either value in a comparison is NaN, returns the value from \a __b.
253///
254/// \headerfile <x86intrin.h>
255///
256/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
257///
258/// \param __a
259/// A 256-bit vector of [4 x double] containing one of the operands.
260/// \param __b
261/// A 256-bit vector of [4 x double] containing one of the operands.
262/// \returns A 256-bit vector of [4 x double] containing the minimum values
263/// between both operands.
264static __inline __m256d __DEFAULT_FN_ATTRS
265_mm256_min_pd(__m256d __a, __m256d __b)
266{
267 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
268}
269
270/// Compares two 256-bit vectors of [8 x float] and returns the lesser
271/// of each pair of values.
272///
273/// If either value in a comparison is NaN, returns the value from \a __b.
274///
275/// \headerfile <x86intrin.h>
276///
277/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
278///
279/// \param __a
280/// A 256-bit vector of [8 x float] containing one of the operands.
281/// \param __b
282/// A 256-bit vector of [8 x float] containing one of the operands.
283/// \returns A 256-bit vector of [8 x float] containing the minimum values
284/// between both operands.
285static __inline __m256 __DEFAULT_FN_ATTRS
286_mm256_min_ps(__m256 __a, __m256 __b)
287{
288 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
289}
290
291/// Multiplies two 256-bit vectors of [4 x double].
292///
293/// \headerfile <x86intrin.h>
294///
295/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
296///
297/// \param __a
298/// A 256-bit vector of [4 x double] containing one of the operands.
299/// \param __b
300/// A 256-bit vector of [4 x double] containing one of the operands.
301/// \returns A 256-bit vector of [4 x double] containing the products of both
302/// operands.
303static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
304_mm256_mul_pd(__m256d __a, __m256d __b) {
305 return (__m256d)((__v4df)__a * (__v4df)__b);
306}
307
308/// Multiplies two 256-bit vectors of [8 x float].
309///
310/// \headerfile <x86intrin.h>
311///
312/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
313///
314/// \param __a
315/// A 256-bit vector of [8 x float] containing one of the operands.
316/// \param __b
317/// A 256-bit vector of [8 x float] containing one of the operands.
318/// \returns A 256-bit vector of [8 x float] containing the products of both
319/// operands.
320static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
321 __m256 __b) {
322 return (__m256)((__v8sf)__a * (__v8sf)__b);
323}
324
325/// Calculates the square roots of the values in a 256-bit vector of
326/// [4 x double].
327///
328/// \headerfile <x86intrin.h>
329///
330/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
331///
332/// \param __a
333/// A 256-bit vector of [4 x double].
334/// \returns A 256-bit vector of [4 x double] containing the square roots of the
335/// values in the operand.
336static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
337 return __builtin_elementwise_sqrt(__a);
338}
339
340/// Calculates the square roots of the values in a 256-bit vector of
341/// [8 x float].
342///
343/// \headerfile <x86intrin.h>
344///
345/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
346///
347/// \param __a
348/// A 256-bit vector of [8 x float].
349/// \returns A 256-bit vector of [8 x float] containing the square roots of the
350/// values in the operand.
351static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
352 return __builtin_elementwise_sqrt(__a);
353}
354
355/// Calculates the reciprocal square roots of the values in a 256-bit
356/// vector of [8 x float].
357///
358/// \headerfile <x86intrin.h>
359///
360/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
361///
362/// \param __a
363/// A 256-bit vector of [8 x float].
364/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
365/// roots of the values in the operand.
366static __inline __m256 __DEFAULT_FN_ATTRS
368{
369 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
370}
371
372/// Calculates the reciprocals of the values in a 256-bit vector of
373/// [8 x float].
374///
375/// \headerfile <x86intrin.h>
376///
377/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
378///
379/// \param __a
380/// A 256-bit vector of [8 x float].
381/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
382/// values in the operand.
383static __inline __m256 __DEFAULT_FN_ATTRS
385{
386 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
387}
388
/// Rounds every element of a 256-bit vector of [4 x double] according to
///    the byte operand. Each source value is rounded to an integer value and
///    returned as a 64-bit double-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value selecting the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] holding the rounded values.
#define _mm256_round_pd(V, M)                                                  \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
420
/// Rounds every element of a 256-bit vector of [8 x float] according to the
///    byte operand. Each source value is rounded to an integer value and
///    returned as a floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value selecting the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] holding the rounded values.
#define _mm256_round_ps(V, M)                                                  \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
452
/// Rounds up every element of a 256-bit vector of [4 x double]. Each source
///    value is rounded up to an integer value and returned as a 64-bit
///    double-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] holding the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
469
/// Rounds down every element of a 256-bit vector of [4 x double]. Each
///    source value is rounded down to an integer value and returned as a
///    64-bit double-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] holding the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
487
/// Rounds up every element of a 256-bit vector of [8 x float]. Each source
///    value is rounded up to an integer value and returned as a
///    floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] holding the rounded up values.
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
504
/// Rounds down every element of a 256-bit vector of [8 x float]. Each source
///    value is rounded down to an integer value and returned as a
///    floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] holding the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
521
522/* Logical */
523/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
524///
525/// \headerfile <x86intrin.h>
526///
527/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
528///
529/// \param __a
530/// A 256-bit vector of [4 x double] containing one of the source operands.
531/// \param __b
532/// A 256-bit vector of [4 x double] containing one of the source operands.
533/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
534/// values between both operands.
535static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
536_mm256_and_pd(__m256d __a, __m256d __b)
537{
538 return (__m256d)((__v4du)__a & (__v4du)__b);
539}
540
541/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
542///
543/// \headerfile <x86intrin.h>
544///
545/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
546///
547/// \param __a
548/// A 256-bit vector of [8 x float] containing one of the source operands.
549/// \param __b
550/// A 256-bit vector of [8 x float] containing one of the source operands.
551/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
552/// values between both operands.
553static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
554_mm256_and_ps(__m256 __a, __m256 __b)
555{
556 return (__m256)((__v8su)__a & (__v8su)__b);
557}
558
559/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
560/// the one's complement of the values contained in the first source operand.
561///
562/// \headerfile <x86intrin.h>
563///
564/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
565///
566/// \param __a
567/// A 256-bit vector of [4 x double] containing the left source operand. The
568/// one's complement of this value is used in the bitwise AND.
569/// \param __b
570/// A 256-bit vector of [4 x double] containing the right source operand.
571/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
572/// values of the second operand and the one's complement of the first
573/// operand.
574static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
575_mm256_andnot_pd(__m256d __a, __m256d __b)
576{
577 return (__m256d)(~(__v4du)__a & (__v4du)__b);
578}
579
580/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
581/// the one's complement of the values contained in the first source operand.
582///
583/// \headerfile <x86intrin.h>
584///
585/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
586///
587/// \param __a
588/// A 256-bit vector of [8 x float] containing the left source operand. The
589/// one's complement of this value is used in the bitwise AND.
590/// \param __b
591/// A 256-bit vector of [8 x float] containing the right source operand.
592/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
593/// values of the second operand and the one's complement of the first
594/// operand.
595static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
596_mm256_andnot_ps(__m256 __a, __m256 __b)
597{
598 return (__m256)(~(__v8su)__a & (__v8su)__b);
599}
600
601/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
602///
603/// \headerfile <x86intrin.h>
604///
605/// This intrinsic corresponds to the <c> VORPD </c> instruction.
606///
607/// \param __a
608/// A 256-bit vector of [4 x double] containing one of the source operands.
609/// \param __b
610/// A 256-bit vector of [4 x double] containing one of the source operands.
611/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
612/// values between both operands.
613static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
614_mm256_or_pd(__m256d __a, __m256d __b)
615{
616 return (__m256d)((__v4du)__a | (__v4du)__b);
617}
618
619/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
620///
621/// \headerfile <x86intrin.h>
622///
623/// This intrinsic corresponds to the <c> VORPS </c> instruction.
624///
625/// \param __a
626/// A 256-bit vector of [8 x float] containing one of the source operands.
627/// \param __b
628/// A 256-bit vector of [8 x float] containing one of the source operands.
629/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
630/// values between both operands.
631static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
632_mm256_or_ps(__m256 __a, __m256 __b)
633{
634 return (__m256)((__v8su)__a | (__v8su)__b);
635}
636
637/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
638///
639/// \headerfile <x86intrin.h>
640///
641/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
642///
643/// \param __a
644/// A 256-bit vector of [4 x double] containing one of the source operands.
645/// \param __b
646/// A 256-bit vector of [4 x double] containing one of the source operands.
647/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
648/// values between both operands.
649static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
650_mm256_xor_pd(__m256d __a, __m256d __b)
651{
652 return (__m256d)((__v4du)__a ^ (__v4du)__b);
653}
654
655/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
656///
657/// \headerfile <x86intrin.h>
658///
659/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
660///
661/// \param __a
662/// A 256-bit vector of [8 x float] containing one of the source operands.
663/// \param __b
664/// A 256-bit vector of [8 x float] containing one of the source operands.
665/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
666/// values between both operands.
667static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
668_mm256_xor_ps(__m256 __a, __m256 __b)
669{
670 return (__m256)((__v8su)__a ^ (__v8su)__b);
671}
672
673/* Horizontal arithmetic */
674/// Horizontally adds the adjacent pairs of values contained in two
675/// 256-bit vectors of [4 x double].
676///
677/// \headerfile <x86intrin.h>
678///
679/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
680///
681/// \param __a
682/// A 256-bit vector of [4 x double] containing one of the source operands.
683/// The horizontal sums of the values are returned in the even-indexed
684/// elements of a vector of [4 x double].
685/// \param __b
686/// A 256-bit vector of [4 x double] containing one of the source operands.
687/// The horizontal sums of the values are returned in the odd-indexed
688/// elements of a vector of [4 x double].
689/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
690/// both operands.
691static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
692_mm256_hadd_pd(__m256d __a, __m256d __b) {
693 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
694}
695
696/// Horizontally adds the adjacent pairs of values contained in two
697/// 256-bit vectors of [8 x float].
698///
699/// \headerfile <x86intrin.h>
700///
701/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
702///
703/// \param __a
704/// A 256-bit vector of [8 x float] containing one of the source operands.
705/// The horizontal sums of the values are returned in the elements with
706/// index 0, 1, 4, 5 of a vector of [8 x float].
707/// \param __b
708/// A 256-bit vector of [8 x float] containing one of the source operands.
709/// The horizontal sums of the values are returned in the elements with
710/// index 2, 3, 6, 7 of a vector of [8 x float].
711/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
712/// both operands.
713static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
714 __m256 __b) {
715 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
716}
717
718/// Horizontally subtracts the adjacent pairs of values contained in two
719/// 256-bit vectors of [4 x double].
720///
721/// \headerfile <x86intrin.h>
722///
723/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
724///
725/// \param __a
726/// A 256-bit vector of [4 x double] containing one of the source operands.
727/// The horizontal differences between the values are returned in the
728/// even-indexed elements of a vector of [4 x double].
729/// \param __b
730/// A 256-bit vector of [4 x double] containing one of the source operands.
731/// The horizontal differences between the values are returned in the
732/// odd-indexed elements of a vector of [4 x double].
733/// \returns A 256-bit vector of [4 x double] containing the horizontal
734/// differences of both operands.
735static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
736_mm256_hsub_pd(__m256d __a, __m256d __b) {
737 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
738}
739
740/// Horizontally subtracts the adjacent pairs of values contained in two
741/// 256-bit vectors of [8 x float].
742///
743/// \headerfile <x86intrin.h>
744///
745/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
746///
747/// \param __a
748/// A 256-bit vector of [8 x float] containing one of the source operands.
749/// The horizontal differences between the values are returned in the
750/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
751/// \param __b
752/// A 256-bit vector of [8 x float] containing one of the source operands.
753/// The horizontal differences between the values are returned in the
754/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
755/// \returns A 256-bit vector of [8 x float] containing the horizontal
756/// differences of both operands.
757static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a,
758 __m256 __b) {
759 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
760}
761
762/* Vector permutations */
763/// Copies the values in a 128-bit vector of [2 x double] as specified
764/// by the 128-bit integer vector operand.
765///
766/// \headerfile <x86intrin.h>
767///
768/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
769///
770/// \param __a
771/// A 128-bit vector of [2 x double].
772/// \param __c
773/// A 128-bit integer vector operand specifying how the values are to be
774/// copied. \n
775/// Bit [1]: \n
776/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
777/// vector. \n
778/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
779/// returned vector. \n
780/// Bit [65]: \n
781/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
782/// returned vector. \n
783/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
784/// returned vector.
785/// \returns A 128-bit vector of [2 x double] containing the copied values.
786static __inline __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
787_mm_permutevar_pd(__m128d __a, __m128i __c) {
788 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
789}
790
791/// Copies the values in a 256-bit vector of [4 x double] as specified
792/// by the 256-bit integer vector operand.
793///
794/// \headerfile <x86intrin.h>
795///
796/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
797///
798/// \param __a
799/// A 256-bit vector of [4 x double].
800/// \param __c
801/// A 256-bit integer vector operand specifying how the values are to be
802/// copied. \n
803/// Bit [1]: \n
804/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
805/// vector. \n
806/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
807/// returned vector. \n
808/// Bit [65]: \n
809/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
810/// returned vector. \n
811/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
812/// returned vector. \n
813/// Bit [129]: \n
814/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
815/// returned vector. \n
816/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
817/// returned vector. \n
818/// Bit [193]: \n
819/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
820/// returned vector. \n
821/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
822/// returned vector.
823/// \returns A 256-bit vector of [4 x double] containing the copied values.
824static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
825_mm256_permutevar_pd(__m256d __a, __m256i __c) {
  // Thin wrapper: __a is cast to __v4df and the control vector __c to __v4di,
  // passed to __builtin_ia32_vpermilvarpd256, and the result is cast back to
  // __m256d.
  // As documented above, the selector is bit 1 of each 64-bit control element
  // (bits 1, 65, 129, 193), not bit 0, and each selector chooses only between
  // the two doubles of its own 128-bit half.
826 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
827}
828
829/// Copies the values stored in a 128-bit vector of [4 x float] as
830/// specified by the 128-bit integer vector operand.
831///
832/// \headerfile <x86intrin.h>
833///
834/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
835///
836/// \param __a
837/// A 128-bit vector of [4 x float].
838/// \param __c
839/// A 128-bit integer vector operand specifying how the values are to be
840/// copied. \n
841/// Bits [1:0]: \n
842/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
843/// returned vector. \n
844/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
845/// returned vector. \n
846/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
847/// returned vector. \n
848/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
849/// returned vector. \n
850/// Bits [33:32]: \n
851/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
852/// returned vector. \n
853/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
854/// returned vector. \n
855/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
856/// returned vector. \n
857/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
858/// returned vector. \n
859/// Bits [65:64]: \n
860/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
861/// returned vector. \n
862/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
863/// returned vector. \n
864/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
865/// returned vector. \n
866/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
867/// returned vector. \n
868/// Bits [97:96]: \n
869/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
870/// returned vector. \n
871/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
872/// returned vector. \n
873/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
874/// returned vector. \n
875/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
876/// returned vector.
877/// \returns A 128-bit vector of [4 x float] containing the copied values.
878static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
879_mm_permutevar_ps(__m128 __a, __m128i __c) {
  // Thin wrapper: __a is cast to __v4sf and the control vector __c to __v4si,
  // passed to __builtin_ia32_vpermilvarps, and the result is cast back to
  // __m128.
  // Per the doc comment, only the low two bits of each 32-bit control element
  // (bits [1:0], [33:32], [65:64], [97:96]) are described as selectors.
880 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
881}
882
883/// Copies the values stored in a 256-bit vector of [8 x float] as
884/// specified by the 256-bit integer vector operand.
885///
886/// \headerfile <x86intrin.h>
887///
888/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
889///
890/// \param __a
891/// A 256-bit vector of [8 x float].
892/// \param __c
893/// A 256-bit integer vector operand specifying how the values are to be
894/// copied. \n
895/// Bits [1:0]: \n
896/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
897/// returned vector. \n
898/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
899/// returned vector. \n
900/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
901/// returned vector. \n
902/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
903/// returned vector. \n
904/// Bits [33:32]: \n
905/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
906/// returned vector. \n
907/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
908/// returned vector. \n
909/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
910/// returned vector. \n
911/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
912/// returned vector. \n
913/// Bits [65:64]: \n
914/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
915/// returned vector. \n
916/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
917/// returned vector. \n
918/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
919/// returned vector. \n
920/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
921/// returned vector. \n
922/// Bits [97:96]: \n
923/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
924/// returned vector. \n
925/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
926/// returned vector. \n
927/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
928/// returned vector. \n
929/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
930/// returned vector. \n
931/// Bits [129:128]: \n
932/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
933/// returned vector. \n
934/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
935/// returned vector. \n
936/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
937/// returned vector. \n
938/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
939/// returned vector. \n
940/// Bits [161:160]: \n
941/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
942/// returned vector. \n
943/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
944/// returned vector. \n
945/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
946/// returned vector. \n
947/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
948/// returned vector. \n
949/// Bits [193:192]: \n
950/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
951/// returned vector. \n
952/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
953/// returned vector. \n
954/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
955/// returned vector. \n
956/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
957/// returned vector. \n
958/// Bits [225:224]: \n
959/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
960/// returned vector. \n
961/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
962/// returned vector. \n
963/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
964/// returned vector. \n
965/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
966/// returned vector.
967/// \returns A 256-bit vector of [8 x float] containing the copied values.
968static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
969_mm256_permutevar_ps(__m256 __a, __m256i __c) {
  // Thin wrapper: __a is cast to __v8sf and the control vector __c to __v8si,
  // passed to __builtin_ia32_vpermilvarps256, and the result is cast back to
  // __m256.
  // Per the doc comment, each 2-bit selector picks a float from the same
  // 128-bit half as its destination; the upper-half selectors (bits 128 and
  // up) only ever name source bits [255:128].
970 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
971}
972
973/// Copies the values in a 128-bit vector of [2 x double] as specified
974/// by the immediate integer operand.
975///
976/// \headerfile <x86intrin.h>
977///
978/// \code
979/// __m128d _mm_permute_pd(__m128d A, const int C);
980/// \endcode
981///
982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
983///
984/// \param A
985/// A 128-bit vector of [2 x double].
986/// \param C
987/// An immediate integer operand specifying how the values are to be
988/// copied. \n
989/// Bit [0]: \n
990/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
991/// vector. \n
992/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
993/// returned vector. \n
994/// Bit [1]: \n
995/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
996/// returned vector. \n
997/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
998/// returned vector.
999/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Expression macro: A is cast to __m128d and then to __v2df, C is cast to int,
 * and both are forwarded to __builtin_ia32_vpermilpd; the result is cast back
 * to __m128d. Every argument and the whole expansion are parenthesized.
 * NOTE(review): presumably a macro rather than a function because C is
 * documented as an immediate and must reach the builtin as a constant
 * expression; confirm against the builtin's definition. */
1000#define _mm_permute_pd(A, C) \
1001 ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
1002
1003/// Copies the values in a 256-bit vector of [4 x double] as specified by
1004/// the immediate integer operand.
1005///
1006/// \headerfile <x86intrin.h>
1007///
1008/// \code
1009/// __m256d _mm256_permute_pd(__m256d A, const int C);
1010/// \endcode
1011///
1012/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1013///
1014/// \param A
1015/// A 256-bit vector of [4 x double].
1016/// \param C
1017/// An immediate integer operand specifying how the values are to be
1018/// copied. \n
1019/// Bit [0]: \n
1020/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1021/// vector. \n
1022/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1023/// returned vector. \n
1024/// Bit [1]: \n
1025/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1026/// returned vector. \n
1027/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1028/// returned vector. \n
1029/// Bit [2]: \n
1030/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1031/// returned vector. \n
1032/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1033/// returned vector. \n
1034/// Bit [3]: \n
1035/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1036/// returned vector. \n
1037/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1038/// returned vector.
1039/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Expression macro: A is cast to __m256d and then to __v4df, C is cast to int,
 * and both are forwarded to __builtin_ia32_vpermilpd256; the result is cast
 * back to __m256d. As documented above, C supplies one selector bit per
 * destination element (bits 0 through 3).
 * NOTE(review): presumably a macro because C must reach the builtin as a
 * constant expression; confirm against the builtin's definition. */
1040#define _mm256_permute_pd(A, C) \
1041 ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
1042
1043/// Copies the values in a 128-bit vector of [4 x float] as specified by
1044/// the immediate integer operand.
1045///
1046/// \headerfile <x86intrin.h>
1047///
1048/// \code
1049/// __m128 _mm_permute_ps(__m128 A, const int C);
1050/// \endcode
1051///
1052/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1053///
1054/// \param A
1055/// A 128-bit vector of [4 x float].
1056/// \param C
1057/// An immediate integer operand specifying how the values are to be
1058/// copied. \n
1059/// Bits [1:0]: \n
1060/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1061/// returned vector. \n
1062/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1063/// returned vector. \n
1064/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1065/// returned vector. \n
1066/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1067/// returned vector. \n
1068/// Bits [3:2]: \n
1069/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1070/// returned vector. \n
1071/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1072/// returned vector. \n
1073/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1074/// returned vector. \n
1075/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1076/// returned vector. \n
1077/// Bits [5:4]: \n
1078/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1079/// returned vector. \n
1080/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1081/// returned vector. \n
1082/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1083/// returned vector. \n
1084/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1085/// returned vector. \n
1086/// Bits [7:6]: \n
1087/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1088/// returned vector. \n
1089/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1090/// returned vector. \n
1091/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1092/// returned vector. \n
1093/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1094/// returned vector.
1095/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Expression macro: A is cast to __m128 and then to __v4sf, C is cast to int,
 * and both are forwarded to __builtin_ia32_vpermilps; the result is cast back
 * to __m128. As documented above, C holds four 2-bit selectors, one per
 * destination element.
 * NOTE(review): presumably a macro because C must reach the builtin as a
 * constant expression; confirm against the builtin's definition. */
1096#define _mm_permute_ps(A, C) \
1097 ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
1098
1099/// Copies the values in a 256-bit vector of [8 x float] as specified by
1100/// the immediate integer operand.
1101///
1102/// \headerfile <x86intrin.h>
1103///
1104/// \code
1105/// __m256 _mm256_permute_ps(__m256 A, const int C);
1106/// \endcode
1107///
1108/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1109///
1110/// \param A
1111/// A 256-bit vector of [8 x float].
1112/// \param C
1113/// An immediate integer operand specifying how the values are to be
1114/// copied. \n
1115/// Bits [1:0]: \n
1116/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1117/// returned vector. \n
1118/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1119/// returned vector. \n
1120/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1121/// returned vector. \n
1122/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1123/// returned vector. \n
1124/// Bits [3:2]: \n
1125/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1126/// returned vector. \n
1127/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1128/// returned vector. \n
1129/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1130/// returned vector. \n
1131/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1132/// returned vector. \n
1133/// Bits [5:4]: \n
1134/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1135/// returned vector. \n
1136/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1137/// returned vector. \n
1138/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1139/// returned vector. \n
1140/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1141/// returned vector. \n
1142/// Bits [7:6]: \n
1143/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1144/// returned vector. \n
1145/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1146/// returned vector. \n
1147/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1148/// returned vector. \n
1149/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1150/// returned vector. \n
1151/// Bits [1:0]: \n
1152/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1153/// returned vector. \n
1154/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1155/// returned vector. \n
1156/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1157/// returned vector. \n
1158/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1159/// returned vector. \n
1160/// Bits [3:2]: \n
1161/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1162/// returned vector. \n
1163/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1164/// returned vector. \n
1165/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1166/// returned vector. \n
1167/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1168/// returned vector. \n
1169/// Bits [5:4]: \n
1170/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1171/// returned vector. \n
1172/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1173/// returned vector. \n
1174/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1175/// returned vector. \n
1176/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1177/// returned vector. \n
1178/// Bits [7:6]: \n
1179/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1180/// returned vector. \n
1181/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1182/// returned vector. \n
1183/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1184/// returned vector. \n
1185/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1186/// returned vector.
1187/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Expression macro: A is cast to __m256 and then to __v8sf, C is cast to int,
 * and both are forwarded to __builtin_ia32_vpermilps256; the result is cast
 * back to __m256.
 * The doc comment lists each 2-bit field of C twice because the same field
 * drives the matching element in both 128-bit halves: once selecting from
 * source bits [127:0] and once from source bits [255:128].
 * NOTE(review): presumably a macro because C must reach the builtin as a
 * constant expression; confirm against the builtin's definition. */
1188#define _mm256_permute_ps(A, C) \
1189 ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
1190
1191/// Permutes 128-bit data values stored in two 256-bit vectors of
1192/// [4 x double], as specified by the immediate integer operand.
1193///
1194/// \headerfile <x86intrin.h>
1195///
1196/// \code
1197/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1198/// \endcode
1199///
1200/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1201///
1202/// \param V1
1203/// A 256-bit vector of [4 x double].
1204/// \param V2
1205/// A 256-bit vector of [4 x double].
1206/// \param M
1207/// An immediate integer operand specifying how the values are to be
1208/// permuted. \n
1209/// Bits [1:0]: \n
1210/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1211/// destination. \n
1212/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1213/// destination. \n
1214/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1215/// destination. \n
1216/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1217/// destination. \n
1218/// Bits [5:4]: \n
1219/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1220/// destination. \n
1221/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1222/// destination. \n
1223/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1224/// destination. \n
1225/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1226/// destination.
1227/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Expression macro: V1 and V2 are each cast to __m256d and then to __v4df, M
 * is cast to int, and all three are forwarded to
 * __builtin_ia32_vperm2f128_pd256; the result is cast back to __m256d.
 * NOTE(review): the doc comment describes only bits [1:0] and [5:4] of M. The
 * Intel SDM entry for VPERM2F128 also defines bits [3] and [7] as zeroing
 * controls for the low and high halves; confirm and document if applicable. */
1228#define _mm256_permute2f128_pd(V1, V2, M) \
1229 ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1230 (__v4df)(__m256d)(V2), (int)(M)))
1231
1232/// Permutes 128-bit data values stored in two 256-bit vectors of
1233/// [8 x float], as specified by the immediate integer operand.
1234///
1235/// \headerfile <x86intrin.h>
1236///
1237/// \code
1238/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1239/// \endcode
1240///
1241/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1242///
1243/// \param V1
1244/// A 256-bit vector of [8 x float].
1245/// \param V2
1246/// A 256-bit vector of [8 x float].
1247/// \param M
1248/// An immediate integer operand specifying how the values are to be
1249/// permuted. \n
1250/// Bits [1:0]: \n
1251/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1252/// destination. \n
1253/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1254/// destination. \n
1255/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1256/// destination. \n
1257/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1258/// destination. \n
1259/// Bits [5:4]: \n
1260/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1261/// destination. \n
1262/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1263/// destination. \n
1264/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1265/// destination. \n
1266/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1267/// destination.
1268/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Expression macro: V1 and V2 are each cast to __m256 and then to __v8sf, M
 * is cast to int, and all three are forwarded to
 * __builtin_ia32_vperm2f128_ps256; the result is cast back to __m256.
 * NOTE(review): the doc comment describes only bits [1:0] and [5:4] of M. The
 * Intel SDM entry for VPERM2F128 also defines bits [3] and [7] as zeroing
 * controls for the low and high halves; confirm and document if applicable. */
1269#define _mm256_permute2f128_ps(V1, V2, M) \
1270 ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1271 (__v8sf)(__m256)(V2), (int)(M)))
1272
1273/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1274/// as specified by the immediate integer operand.
1275///
1276/// \headerfile <x86intrin.h>
1277///
1278/// \code
1279/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1280/// \endcode
1281///
1282/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1283///
1284/// \param V1
1285/// A 256-bit integer vector.
1286/// \param V2
1287/// A 256-bit integer vector.
1288/// \param M
1289/// An immediate integer operand specifying how the values are to be copied. \n
1290/// Bits [1:0]: \n
1291/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1292/// destination. \n
1293/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1294/// destination. \n
1295/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1296/// destination. \n
1297/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1298/// destination. \n
1299/// Bits [5:4]: \n
1300/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1301/// destination. \n
1302/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1303/// destination. \n
1304/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1305/// destination. \n
1306/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1307/// destination.
1308/// \returns A 256-bit integer vector containing the copied values.
/* Expression macro: V1 and V2 are each cast to __m256i and then to __v8si, M
 * is cast to int, and all three are forwarded to
 * __builtin_ia32_vperm2f128_si256; the result is cast back to __m256i.
 * The operands are viewed as __v8si only to match the builtin's signature;
 * per the doc comment the data is moved in whole 128-bit halves.
 * NOTE(review): the doc comment describes only bits [1:0] and [5:4] of M. The
 * Intel SDM entry for VPERM2F128 also defines bits [3] and [7] as zeroing
 * controls for the low and high halves; confirm and document if applicable. */
1309#define _mm256_permute2f128_si256(V1, V2, M) \
1310 ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1311 (__v8si)(__m256i)(V2), (int)(M)))
1312
1313/* Vector Blend */
1314/// Merges 64-bit double-precision data values stored in either of the
1315/// two 256-bit vectors of [4 x double], as specified by the immediate
1316/// integer operand.
1317///
1318/// \headerfile <x86intrin.h>
1319///
1320/// \code
1321/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1322/// \endcode
1323///
1324/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1325///
1326/// \param V1
1327/// A 256-bit vector of [4 x double].
1328/// \param V2
1329/// A 256-bit vector of [4 x double].
1330/// \param M
1331/// An immediate integer operand, with mask bits [3:0] specifying how the
1332/// values are to be copied. The position of the mask bit corresponds to the
1333/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1334/// element in operand \a V1 is copied to the same position in the
1335/// destination. When a mask bit is 1, the corresponding 64-bit element in
1336/// operand \a V2 is copied to the same position in the destination.
1337/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Expression macro: V1 and V2 are each cast to __m256d and then to __v4df, M
 * is cast to int, and all three are forwarded to __builtin_ia32_blendpd256;
 * the result is cast back to __m256d. Per the doc comment, bit i of M
 * (bits [3:0]) picks element i from V1 when 0 and from V2 when 1.
 * NOTE(review): presumably a macro because M must reach the builtin as a
 * constant expression; confirm against the builtin's definition. */
1338#define _mm256_blend_pd(V1, V2, M) \
1339 ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
1340 (__v4df)(__m256d)(V2), (int)(M)))
1341
1342/// Merges 32-bit single-precision data values stored in either of the
1343/// two 256-bit vectors of [8 x float], as specified by the immediate
1344/// integer operand.
1345///
1346/// \headerfile <x86intrin.h>
1347///
1348/// \code
1349/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1350/// \endcode
1351///
1352/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1353///
1354/// \param V1
1355/// A 256-bit vector of [8 x float].
1356/// \param V2
1357/// A 256-bit vector of [8 x float].
1358/// \param M
1359/// An immediate integer operand, with mask bits [7:0] specifying how the
1360/// values are to be copied. The position of the mask bit corresponds to the
1361/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1362/// element in operand \a V1 is copied to the same position in the
1363/// destination. When a mask bit is 1, the corresponding 32-bit element in
1364/// operand \a V2 is copied to the same position in the destination.
1365/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Expression macro: V1 and V2 are each cast to __m256 and then to __v8sf, M
 * is cast to int, and all three are forwarded to __builtin_ia32_blendps256;
 * the result is cast back to __m256. Per the doc comment, bit i of M
 * (bits [7:0]) picks element i from V1 when 0 and from V2 when 1.
 * NOTE(review): presumably a macro because M must reach the builtin as a
 * constant expression; confirm against the builtin's definition. */
1366#define _mm256_blend_ps(V1, V2, M) \
1367 ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
1368 (__v8sf)(__m256)(V2), (int)(M)))
1369
1370/// Merges 64-bit double-precision data values stored in either of the
1371/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1372/// operand.
1373///
1374/// \headerfile <x86intrin.h>
1375///
1376/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1377///
1378/// \param __a
1379/// A 256-bit vector of [4 x double].
1380/// \param __b
1381/// A 256-bit vector of [4 x double].
1382/// \param __c
1383/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1384/// how the values are to be copied. The position of the mask bit corresponds
1385/// to the most significant bit of a copied value. When a mask bit is 0, the
1386/// corresponding 64-bit element in operand \a __a is copied to the same
1387/// position in the destination. When a mask bit is 1, the corresponding
1388/// 64-bit element in operand \a __b is copied to the same position in the
1389/// destination.
1390/// \returns A 256-bit vector of [4 x double] containing the copied values.
1391static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
1392_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {
  // Thin wrapper: all three operands, including the mask __c, are cast to
  // __v4df and passed to __builtin_ia32_blendvpd256; the result is cast back
  // to __m256d.
  // The mask travels as a floating-point vector; per the doc comment only the
  // most significant bit of each 64-bit element (bits 63, 127, 191, 255) is
  // used to choose between __a (bit clear) and __b (bit set).
1393 return (__m256d)__builtin_ia32_blendvpd256(
1394 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1395}
1396
1397/// Merges 32-bit single-precision data values stored in either of the
1398/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1399/// operand.
1400///
1401/// \headerfile <x86intrin.h>
1402///
1403/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1404///
1405/// \param __a
1406/// A 256-bit vector of [8 x float].
1407/// \param __b
1408/// A 256-bit vector of [8 x float].
1409/// \param __c
1410/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1411/// and 31 specifying how the values are to be copied. The position of the
1412/// mask bit corresponds to the most significant bit of a copied value. When
1413/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1414/// copied to the same position in the destination. When a mask bit is 1, the
1415/// corresponding 32-bit element in operand \a __b is copied to the same
1416/// position in the destination.
1417/// \returns A 256-bit vector of [8 x float] containing the copied values.
1418static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
1419_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {
  // Thin wrapper: all three operands, including the mask __c, are cast to
  // __v8sf and passed to __builtin_ia32_blendvps256; the result is cast back
  // to __m256.
  // The mask travels as a floating-point vector; per the doc comment only the
  // most significant bit of each 32-bit element is used to choose between
  // __a (bit clear) and __b (bit set).
1420 return (__m256)__builtin_ia32_blendvps256(
1421 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1422}
1423
1424/* Vector Dot Product */
1425/// Computes two dot products in parallel, using the lower and upper
1426/// halves of two [8 x float] vectors as input to the two computations, and
1427/// returning the two dot products in the lower and upper halves of the
1428/// [8 x float] result.
1429///
1430/// The immediate integer operand controls which input elements will
1431/// contribute to the dot product, and where the final results are returned.
1432/// In general, for each dot product, the four corresponding elements of the
1433/// input vectors are multiplied; the first two and second two products are
1434/// summed, then the two sums are added to form the final result.
1435///
1436/// \headerfile <x86intrin.h>
1437///
1438/// \code
1439/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1440/// \endcode
1441///
1442/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1443///
1444/// \param V1
1445/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1446/// \param V2
1447/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1448/// \param M
1449/// An immediate integer argument. Bits [7:4] determine which elements of
1450/// the input vectors are used, with bit [4] corresponding to the lowest
1451/// element and bit [7] corresponding to the highest element of each [4 x
1452/// float] subvector. If a bit is set, the corresponding elements from the
1453/// two input vectors are used as an input for dot product; otherwise that
1454/// input is treated as zero. Bits [3:0] determine which elements of the
1455/// result will receive a copy of the final dot product, with bit [0]
1456/// corresponding to the lowest element and bit [3] corresponding to the
1457/// highest element of each [4 x float] subvector. If a bit is set, the dot
1458/// product is returned in the corresponding element; otherwise that element
1459/// is set to zero. The bitmask is applied in the same way to each of the
1460/// two parallel dot product computations.
1461/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* Expression macro: V1 and V2 are each cast to __m256 and then to __v8sf and
 * forwarded, together with M, to __builtin_ia32_dpps256; the result is cast
 * back to __m256.
 * NOTE(review): unlike the neighbouring immediate-operand macros, M is passed
 * as (M) with no (int) cast. This may be deliberate if the builtin expects a
 * narrower immediate type; confirm against the builtin's definition before
 * "normalizing" it. */
1462#define _mm256_dp_ps(V1, V2, M) \
1463 ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
1464 (__v8sf)(__m256)(V2), (M)))
1465
1466/* Vector shuffle */
1467/// Selects 8 float values from the 256-bit operands of [8 x float], as
1468/// specified by the immediate value operand.
1469///
1470/// The four selected elements in each operand are copied to the destination
1471/// according to the bits specified in the immediate operand. The selected
1472/// elements from the first 256-bit operand are copied to bits [63:0] and
1473/// bits [191:128] of the destination, and the selected elements from the
1474/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1475/// the destination. For example, if bits [7:0] of the immediate operand
1476/// contain a value of 0xFF, the 256-bit destination vector would contain the
1477/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1478///
1479/// \headerfile <x86intrin.h>
1480///
1481/// \code
1482/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1483/// \endcode
1484///
1485/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1486///
1487/// \param a
1488/// A 256-bit vector of [8 x float]. The four selected elements in this
1489/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1490/// according to the bits specified in the immediate operand.
1491/// \param b
1492/// A 256-bit vector of [8 x float]. The four selected elements in this
1493/// operand are copied to bits [127:64] and bits [255:192] in the
1494/// destination, according to the bits specified in the immediate operand.
1495/// \param mask
1496/// An immediate value containing an 8-bit value specifying which elements to
1497/// copy from \a a and \a b. \n
1498/// Bits [3:0] specify the values copied from operand \a a. \n
1499/// Bits [7:4] specify the values copied from operand \a b. \n
1500/// The destinations within the 256-bit destination are assigned values as
1501/// follows, according to the bit value assignments described below: \n
1502/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1503/// destination. \n
1504/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1505/// destination. \n
1506/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1507/// destination. \n
1508/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1509/// the destination. \n
1510/// Bit value assignments: \n
1511/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1512/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1513/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1514/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1515/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1516/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1517/// <c>[b6, b4, b2, b0]</c>.
1518/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
/* Expression macro: a and b are each cast to __m256 and then to __v8sf, mask
 * is cast to int, and all three are forwarded to __builtin_ia32_shufps256;
 * the result is cast back to __m256.
 * Per the doc comment, each 2-bit field of mask is applied to both 128-bit
 * halves: fields [1:0] and [3:2] select from a, fields [5:4] and [7:6] select
 * from b.
 * NOTE(review): presumably a macro because mask must reach the builtin as a
 * constant expression; confirm against the builtin's definition. */
1519#define _mm256_shuffle_ps(a, b, mask) \
1520 ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
1521 (__v8sf)(__m256)(b), (int)(mask)))
1522
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy
///    from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1568
/* Compare */
/* Comparison predicate encodings 0x08-0x1f, for use as the immediate operand
 * of the cmp intrinsics. Each value matches the entry with the same number in
 * the predicate tables of those intrinsics' documentation. Encodings
 * 0x00-0x07 are not defined in this group.
 * NOTE(review): presumably they are supplied by the SSE headers -- confirm. */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
1594
1595/* Below intrinsic defined in emmintrin.h can be used for AVX */
1596/// Compares each of the corresponding double-precision values of two
1597/// 128-bit vectors of [2 x double], using the operation specified by the
1598/// immediate integer operand.
1599///
1600/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1601/// If either value in a comparison is NaN, comparisons that are ordered
1602/// return false, and comparisons that are unordered return true.
1603///
1604/// \headerfile <x86intrin.h>
1605///
1606/// \code
1607/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1608/// \endcode
1609///
1610/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1611///
1612/// \param a
1613/// A 128-bit vector of [2 x double].
1614/// \param b
1615/// A 128-bit vector of [2 x double].
1616/// \param c
1617/// An immediate integer operand, with bits [4:0] specifying which comparison
1618/// operation to use: \n
1619/// 0x00: Equal (ordered, non-signaling) \n
1620/// 0x01: Less-than (ordered, signaling) \n
1621/// 0x02: Less-than-or-equal (ordered, signaling) \n
1622/// 0x03: Unordered (non-signaling) \n
1623/// 0x04: Not-equal (unordered, non-signaling) \n
1624/// 0x05: Not-less-than (unordered, signaling) \n
1625/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1626/// 0x07: Ordered (non-signaling) \n
1627/// 0x08: Equal (unordered, non-signaling) \n
1628/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1629/// 0x0A: Not-greater-than (unordered, signaling) \n
1630/// 0x0B: False (ordered, non-signaling) \n
1631/// 0x0C: Not-equal (ordered, non-signaling) \n
1632/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1633/// 0x0E: Greater-than (ordered, signaling) \n
1634/// 0x0F: True (unordered, non-signaling) \n
1635/// 0x10: Equal (ordered, signaling) \n
1636/// 0x11: Less-than (ordered, non-signaling) \n
1637/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1638/// 0x13: Unordered (signaling) \n
1639/// 0x14: Not-equal (unordered, signaling) \n
1640/// 0x15: Not-less-than (unordered, non-signaling) \n
1641/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1642/// 0x17: Ordered (signaling) \n
1643/// 0x18: Equal (unordered, signaling) \n
1644/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1645/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1646/// 0x1B: False (ordered, signaling) \n
1647/// 0x1C: Not-equal (ordered, signaling) \n
1648/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1649/// 0x1E: Greater-than (ordered, non-signaling) \n
1650/// 0x1F: True (unordered, signaling)
1651/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1652/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1653
1654/* Below intrinsic defined in xmmintrin.h can be used for AVX */
1655/// Compares each of the corresponding values of two 128-bit vectors of
1656/// [4 x float], using the operation specified by the immediate integer
1657/// operand.
1658///
1659/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1660/// If either value in a comparison is NaN, comparisons that are ordered
1661/// return false, and comparisons that are unordered return true.
1662///
1663/// \headerfile <x86intrin.h>
1664///
1665/// \code
1666/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1667/// \endcode
1668///
1669/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1670///
1671/// \param a
1672/// A 128-bit vector of [4 x float].
1673/// \param b
1674/// A 128-bit vector of [4 x float].
1675/// \param c
1676/// An immediate integer operand, with bits [4:0] specifying which comparison
1677/// operation to use: \n
1678/// 0x00: Equal (ordered, non-signaling) \n
1679/// 0x01: Less-than (ordered, signaling) \n
1680/// 0x02: Less-than-or-equal (ordered, signaling) \n
1681/// 0x03: Unordered (non-signaling) \n
1682/// 0x04: Not-equal (unordered, non-signaling) \n
1683/// 0x05: Not-less-than (unordered, signaling) \n
1684/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1685/// 0x07: Ordered (non-signaling) \n
1686/// 0x08: Equal (unordered, non-signaling) \n
1687/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1688/// 0x0A: Not-greater-than (unordered, signaling) \n
1689/// 0x0B: False (ordered, non-signaling) \n
1690/// 0x0C: Not-equal (ordered, non-signaling) \n
1691/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1692/// 0x0E: Greater-than (ordered, signaling) \n
1693/// 0x0F: True (unordered, non-signaling) \n
1694/// 0x10: Equal (ordered, signaling) \n
1695/// 0x11: Less-than (ordered, non-signaling) \n
1696/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1697/// 0x13: Unordered (signaling) \n
1698/// 0x14: Not-equal (unordered, signaling) \n
1699/// 0x15: Not-less-than (unordered, non-signaling) \n
1700/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1701/// 0x17: Ordered (signaling) \n
1702/// 0x18: Equal (unordered, signaling) \n
1703/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1704/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1705/// 0x1B: False (ordered, signaling) \n
1706/// 0x1C: Not-equal (ordered, signaling) \n
1707/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1708/// 0x1E: Greater-than (ordered, non-signaling) \n
1709/// 0x1F: True (unordered, signaling)
1710/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1711/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1712
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1772
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1832
1833/* Below intrinsic defined in emmintrin.h can be used for AVX */
1834/// Compares each of the corresponding scalar double-precision values of
1835/// two 128-bit vectors of [2 x double], using the operation specified by the
1836/// immediate integer operand.
1837///
1838/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1839/// If either value in a comparison is NaN, comparisons that are ordered
1840/// return false, and comparisons that are unordered return true.
1841///
1842/// \headerfile <x86intrin.h>
1843///
1844/// \code
1845/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1846/// \endcode
1847///
1848/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1849///
1850/// \param a
1851/// A 128-bit vector of [2 x double].
1852/// \param b
1853/// A 128-bit vector of [2 x double].
1854/// \param c
1855/// An immediate integer operand, with bits [4:0] specifying which comparison
1856/// operation to use: \n
1857/// 0x00: Equal (ordered, non-signaling) \n
1858/// 0x01: Less-than (ordered, signaling) \n
1859/// 0x02: Less-than-or-equal (ordered, signaling) \n
1860/// 0x03: Unordered (non-signaling) \n
1861/// 0x04: Not-equal (unordered, non-signaling) \n
1862/// 0x05: Not-less-than (unordered, signaling) \n
1863/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1864/// 0x07: Ordered (non-signaling) \n
1865/// 0x08: Equal (unordered, non-signaling) \n
1866/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1867/// 0x0A: Not-greater-than (unordered, signaling) \n
1868/// 0x0B: False (ordered, non-signaling) \n
1869/// 0x0C: Not-equal (ordered, non-signaling) \n
1870/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1871/// 0x0E: Greater-than (ordered, signaling) \n
1872/// 0x0F: True (unordered, non-signaling) \n
1873/// 0x10: Equal (ordered, signaling) \n
1874/// 0x11: Less-than (ordered, non-signaling) \n
1875/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1876/// 0x13: Unordered (signaling) \n
1877/// 0x14: Not-equal (unordered, signaling) \n
1878/// 0x15: Not-less-than (unordered, non-signaling) \n
1879/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1880/// 0x17: Ordered (signaling) \n
1881/// 0x18: Equal (unordered, signaling) \n
1882/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1883/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1884/// 0x1B: False (ordered, signaling) \n
1885/// 0x1C: Not-equal (ordered, signaling) \n
1886/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1887/// 0x1E: Greater-than (ordered, non-signaling) \n
1888/// 0x1F: True (unordered, signaling)
1889/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1890/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1891
1892/* Below intrinsic defined in xmmintrin.h can be used for AVX */
1893/// Compares each of the corresponding scalar values of two 128-bit
1894/// vectors of [4 x float], using the operation specified by the immediate
1895/// integer operand.
1896///
1897/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1898/// If either value in a comparison is NaN, comparisons that are ordered
1899/// return false, and comparisons that are unordered return true.
1900///
1901/// \headerfile <x86intrin.h>
1902///
1903/// \code
1904/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1905/// \endcode
1906///
1907/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1908///
1909/// \param a
1910/// A 128-bit vector of [4 x float].
1911/// \param b
1912/// A 128-bit vector of [4 x float].
1913/// \param c
1914/// An immediate integer operand, with bits [4:0] specifying which comparison
1915/// operation to use: \n
1916/// 0x00: Equal (ordered, non-signaling) \n
1917/// 0x01: Less-than (ordered, signaling) \n
1918/// 0x02: Less-than-or-equal (ordered, signaling) \n
1919/// 0x03: Unordered (non-signaling) \n
1920/// 0x04: Not-equal (unordered, non-signaling) \n
1921/// 0x05: Not-less-than (unordered, signaling) \n
1922/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1923/// 0x07: Ordered (non-signaling) \n
1924/// 0x08: Equal (unordered, non-signaling) \n
1925/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1926/// 0x0A: Not-greater-than (unordered, signaling) \n
1927/// 0x0B: False (ordered, non-signaling) \n
1928/// 0x0C: Not-equal (ordered, non-signaling) \n
1929/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1930/// 0x0E: Greater-than (ordered, signaling) \n
1931/// 0x0F: True (unordered, non-signaling) \n
1932/// 0x10: Equal (ordered, signaling) \n
1933/// 0x11: Less-than (ordered, non-signaling) \n
1934/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1935/// 0x13: Unordered (signaling) \n
1936/// 0x14: Not-equal (unordered, signaling) \n
1937/// 0x15: Not-less-than (unordered, non-signaling) \n
1938/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1939/// 0x17: Ordered (signaling) \n
1940/// 0x18: Equal (unordered, signaling) \n
1941/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1942/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1943/// 0x1B: False (ordered, signaling) \n
1944/// 0x1C: Not-equal (ordered, signaling) \n
1945/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1946/// 0x1E: Greater-than (ordered, non-signaling) \n
1947/// 0x1F: True (unordered, signaling)
1948/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1949/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1950
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1972
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
1995
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2018
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2042
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2068
2069
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2095
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2121
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2149
2150/* Conversion */
2151/// Converts a vector of [4 x i32] into a vector of [4 x double].
2152///
2153/// \headerfile <x86intrin.h>
2154///
2155/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2156///
2157/// \param __a
2158/// A 128-bit integer vector of [4 x i32].
2159/// \returns A 256-bit vector of [4 x double] containing the converted values.
2160static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2162 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2163}
2164
2165/// Converts a vector of [8 x i32] into a vector of [8 x float].
2166///
2167/// \headerfile <x86intrin.h>
2168///
2169/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2170///
2171/// \param __a
2172/// A 256-bit integer vector.
2173/// \returns A 256-bit vector of [8 x float] containing the converted values.
2174static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2176 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2177}
2178
2179/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2180/// [4 x float].
2181///
2182/// \headerfile <x86intrin.h>
2183///
2184/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2185///
2186/// \param __a
2187/// A 256-bit vector of [4 x double].
2188/// \returns A 128-bit vector of [4 x float] containing the converted values.
2189static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2191 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2192}
2193
2194/// Converts a vector of [8 x float] into a vector of [8 x i32].
2195///
2196/// If a converted value does not fit in a 32-bit integer, raises a
2197/// floating-point invalid exception. If the exception is masked, returns
2198/// the most negative integer.
2199///
2200/// \headerfile <x86intrin.h>
2201///
2202/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2203///
2204/// \param __a
2205/// A 256-bit vector of [8 x float].
2206/// \returns A 256-bit integer vector containing the converted values.
2207static __inline __m256i __DEFAULT_FN_ATTRS
2209{
2210 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2211}
2212
2213/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2214/// x double].
2215///
2216/// \headerfile <x86intrin.h>
2217///
2218/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2219///
2220/// \param __a
2221/// A 128-bit vector of [4 x float].
2222/// \returns A 256-bit vector of [4 x double] containing the converted values.
2223static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2225 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2226}
2227
2228/// Converts a 256-bit vector of [4 x double] into four signed truncated
2229/// (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2230/// [4 x i32].
2231///
2232/// If a converted value does not fit in a 32-bit integer, raises a
2233/// floating-point invalid exception. If the exception is masked, returns
2234/// the most negative integer.
2235///
2236/// \headerfile <x86intrin.h>
2237///
2238/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2239///
2240/// \param __a
2241/// A 256-bit vector of [4 x double].
2242/// \returns A 128-bit integer vector containing the converted values.
2243static __inline __m128i __DEFAULT_FN_ATTRS
2245{
2246 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2247}
2248
2249/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2250/// [4 x i32].
2251///
2252/// If a converted value does not fit in a 32-bit integer, raises a
2253/// floating-point invalid exception. If the exception is masked, returns
2254/// the most negative integer.
2255///
2256/// \headerfile <x86intrin.h>
2257///
2258/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2259///
2260/// \param __a
2261/// A 256-bit vector of [4 x double].
2262/// \returns A 128-bit integer vector containing the converted values.
2263static __inline __m128i __DEFAULT_FN_ATTRS
2265{
2266 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2267}
2268
2269/// Converts a vector of [8 x float] into eight signed truncated (rounded
2270/// toward zero) 32-bit integers returned in a vector of [8 x i32].
2271///
2272/// If a converted value does not fit in a 32-bit integer, raises a
2273/// floating-point invalid exception. If the exception is masked, returns
2274/// the most negative integer.
2275///
2276/// \headerfile <x86intrin.h>
2277///
2278/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2279///
2280/// \param __a
2281/// A 256-bit vector of [8 x float].
2282/// \returns A 256-bit integer vector containing the converted values.
2283static __inline __m256i __DEFAULT_FN_ATTRS
2285{
2286 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2287}
2288
2289/// Returns the first element of the input vector of [4 x double].
2290///
2291/// \headerfile <x86intrin.h>
2292///
2293/// This intrinsic is a utility function and does not correspond to a specific
2294/// instruction.
2295///
2296/// \param __a
2297/// A 256-bit vector of [4 x double].
/// \returns A 64-bit double containing the first element of the input vector.
2299static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR
2301 return __a[0];
2302}
2303
2304/// Returns the first element of the input vector of [8 x i32].
2305///
2306/// \headerfile <x86intrin.h>
2307///
2308/// This intrinsic is a utility function and does not correspond to a specific
2309/// instruction.
2310///
2311/// \param __a
2312/// A 256-bit vector of [8 x i32].
/// \returns A 32-bit integer containing the first element of the input vector.
2314static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2316 __v8si __b = (__v8si)__a;
2317 return __b[0];
2318}
2319
2320/// Returns the first element of the input vector of [8 x float].
2321///
2322/// \headerfile <x86intrin.h>
2323///
2324/// This intrinsic is a utility function and does not correspond to a specific
2325/// instruction.
2326///
2327/// \param __a
2328/// A 256-bit vector of [8 x float].
/// \returns A 32-bit float containing the first element of the input vector.
2330static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR
2332 return __a[0];
2333}
2334
2335/* Vector replicate */
2336/// Moves and duplicates odd-indexed values from a 256-bit vector of
2337/// [8 x float] to float values in a 256-bit vector of [8 x float].
2338///
2339/// \headerfile <x86intrin.h>
2340///
2341/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2342///
2343/// \param __a
2344/// A 256-bit vector of [8 x float]. \n
2345/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2346/// the return value. \n
2347/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2348/// the return value. \n
2349/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2350/// return value. \n
2351/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2352/// return value.
2353/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2354/// values.
2355static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2357{
2358 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2359}
2360
2361/// Moves and duplicates even-indexed values from a 256-bit vector of
2362/// [8 x float] to float values in a 256-bit vector of [8 x float].
2363///
2364/// \headerfile <x86intrin.h>
2365///
2366/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2367///
2368/// \param __a
2369/// A 256-bit vector of [8 x float]. \n
2370/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2371/// the return value. \n
2372/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2373/// the return value. \n
2374/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2375/// return value. \n
2376/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2377/// return value.
2378/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2379/// values.
2380static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2382{
2383 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2384}
2385
2386/// Moves and duplicates double-precision floating point values from a
2387/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2388/// vector of [4 x double].
2389///
2390/// \headerfile <x86intrin.h>
2391///
2392/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2393///
2394/// \param __a
2395/// A 256-bit vector of [4 x double]. \n
2396/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2397/// return value. \n
2398/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2399/// the return value.
2400/// \returns A 256-bit vector of [4 x double] containing the moved and
2401/// duplicated values.
2402static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2404{
2405 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2406}
2407
2408/* Unpack and Interleave */
2409/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2410/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2411///
2412/// \headerfile <x86intrin.h>
2413///
2414/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2415///
2416/// \param __a
2417/// A 256-bit floating-point vector of [4 x double]. \n
2418/// Bits [127:64] are written to bits [63:0] of the return value. \n
2419/// Bits [255:192] are written to bits [191:128] of the return value. \n
2420/// \param __b
2421/// A 256-bit floating-point vector of [4 x double]. \n
2422/// Bits [127:64] are written to bits [127:64] of the return value. \n
2423/// Bits [255:192] are written to bits [255:192] of the return value. \n
2424/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2425static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2426_mm256_unpackhi_pd(__m256d __a, __m256d __b) {
2427 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2428}
2429
2430/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2431/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2432///
2433/// \headerfile <x86intrin.h>
2434///
2435/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2436///
2437/// \param __a
2438/// A 256-bit floating-point vector of [4 x double]. \n
2439/// Bits [63:0] are written to bits [63:0] of the return value. \n
2440/// Bits [191:128] are written to bits [191:128] of the return value.
2441/// \param __b
2442/// A 256-bit floating-point vector of [4 x double]. \n
2443/// Bits [63:0] are written to bits [127:64] of the return value. \n
2444/// Bits [191:128] are written to bits [255:192] of the return value. \n
2445/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2446static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
2447_mm256_unpacklo_pd(__m256d __a, __m256d __b) {
2448 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2449}
2450
2451/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2452/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2453/// vector of [8 x float].
2454///
2455/// \headerfile <x86intrin.h>
2456///
2457/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2458///
2459/// \param __a
2460/// A 256-bit vector of [8 x float]. \n
2461/// Bits [95:64] are written to bits [31:0] of the return value. \n
2462/// Bits [127:96] are written to bits [95:64] of the return value. \n
2463/// Bits [223:192] are written to bits [159:128] of the return value. \n
2464/// Bits [255:224] are written to bits [223:192] of the return value.
2465/// \param __b
2466/// A 256-bit vector of [8 x float]. \n
2467/// Bits [95:64] are written to bits [63:32] of the return value. \n
2468/// Bits [127:96] are written to bits [127:96] of the return value. \n
2469/// Bits [223:192] are written to bits [191:160] of the return value. \n
2470/// Bits [255:224] are written to bits [255:224] of the return value.
2471/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2472static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2473_mm256_unpackhi_ps(__m256 __a, __m256 __b) {
2474 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2475}
2476
2477/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2478/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2479/// vector of [8 x float].
2480///
2481/// \headerfile <x86intrin.h>
2482///
2483/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2484///
2485/// \param __a
2486/// A 256-bit vector of [8 x float]. \n
2487/// Bits [31:0] are written to bits [31:0] of the return value. \n
2488/// Bits [63:32] are written to bits [95:64] of the return value. \n
2489/// Bits [159:128] are written to bits [159:128] of the return value. \n
2490/// Bits [191:160] are written to bits [223:192] of the return value.
2491/// \param __b
2492/// A 256-bit vector of [8 x float]. \n
2493/// Bits [31:0] are written to bits [63:32] of the return value. \n
2494/// Bits [63:32] are written to bits [127:96] of the return value. \n
2495/// Bits [159:128] are written to bits [191:160] of the return value. \n
2496/// Bits [191:160] are written to bits [255:224] of the return value.
2497/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2498static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
2499_mm256_unpacklo_ps(__m256 __a, __m256 __b) {
2500 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2501}
2502
2503/* Bit Test */
2504/// Given two 128-bit floating-point vectors of [2 x double], perform an
2505/// element-by-element comparison of the double-precision element in the
2506/// first source vector and the corresponding element in the second source
2507/// vector.
2508///
2509/// The EFLAGS register is updated as follows: \n
2510/// If there is at least one pair of double-precision elements where the
2511/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2512/// ZF flag is set to 1. \n
2513/// If there is at least one pair of double-precision elements where the
2514/// sign-bit of the first element is 0 and the sign-bit of the second element
2515/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2516/// This intrinsic returns the value of the ZF flag.
2517///
2518/// \headerfile <x86intrin.h>
2519///
2520/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2521///
2522/// \param __a
2523/// A 128-bit vector of [2 x double].
2524/// \param __b
2525/// A 128-bit vector of [2 x double].
2526/// \returns the ZF flag in the EFLAGS register.
2528 __m128d __b) {
2529 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2530}
2531
2532/// Given two 128-bit floating-point vectors of [2 x double], perform an
2533/// element-by-element comparison of the double-precision element in the
2534/// first source vector and the corresponding element in the second source
2535/// vector.
2536///
2537/// The EFLAGS register is updated as follows: \n
2538/// If there is at least one pair of double-precision elements where the
2539/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2540/// ZF flag is set to 1. \n
2541/// If there is at least one pair of double-precision elements where the
2542/// sign-bit of the first element is 0 and the sign-bit of the second element
2543/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2544/// This intrinsic returns the value of the CF flag.
2545///
2546/// \headerfile <x86intrin.h>
2547///
2548/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2549///
2550/// \param __a
2551/// A 128-bit vector of [2 x double].
2552/// \param __b
2553/// A 128-bit vector of [2 x double].
2554/// \returns the CF flag in the EFLAGS register.
2556 __m128d __b) {
2557 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2558}
2559
2560/// Given two 128-bit floating-point vectors of [2 x double], perform an
2561/// element-by-element comparison of the double-precision element in the
2562/// first source vector and the corresponding element in the second source
2563/// vector.
2564///
2565/// The EFLAGS register is updated as follows: \n
2566/// If there is at least one pair of double-precision elements where the
2567/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2568/// ZF flag is set to 1. \n
2569/// If there is at least one pair of double-precision elements where the
2570/// sign-bit of the first element is 0 and the sign-bit of the second element
2571/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2572/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2573/// otherwise it returns 0.
2574///
2575/// \headerfile <x86intrin.h>
2576///
2577/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2578///
2579/// \param __a
2580/// A 128-bit vector of [2 x double].
2581/// \param __b
2582/// A 128-bit vector of [2 x double].
2583/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2584static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR
2585_mm_testnzc_pd(__m128d __a, __m128d __b) {
2586 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2587}
2588
2589/// Given two 128-bit floating-point vectors of [4 x float], perform an
2590/// element-by-element comparison of the single-precision element in the
2591/// first source vector and the corresponding element in the second source
2592/// vector.
2593///
2594/// The EFLAGS register is updated as follows: \n
2595/// If there is at least one pair of single-precision elements where the
2596/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2597/// ZF flag is set to 1. \n
2598/// If there is at least one pair of single-precision elements where the
2599/// sign-bit of the first element is 0 and the sign-bit of the second element
2600/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2601/// This intrinsic returns the value of the ZF flag.
2602///
2603/// \headerfile <x86intrin.h>
2604///
2605/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2606///
2607/// \param __a
2608/// A 128-bit vector of [4 x float].
2609/// \param __b
2610/// A 128-bit vector of [4 x float].
2611/// \returns the ZF flag.
2613 __m128 __b) {
2614 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2615}
2616
2617/// Given two 128-bit floating-point vectors of [4 x float], perform an
2618/// element-by-element comparison of the single-precision element in the
2619/// first source vector and the corresponding element in the second source
2620/// vector.
2621///
2622/// The EFLAGS register is updated as follows: \n
2623/// If there is at least one pair of single-precision elements where the
2624/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2625/// ZF flag is set to 1. \n
2626/// If there is at least one pair of single-precision elements where the
2627/// sign-bit of the first element is 0 and the sign-bit of the second element
2628/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2629/// This intrinsic returns the value of the CF flag.
2630///
2631/// \headerfile <x86intrin.h>
2632///
2633/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2634///
2635/// \param __a
2636/// A 128-bit vector of [4 x float].
2637/// \param __b
2638/// A 128-bit vector of [4 x float].
2639/// \returns the CF flag.
2641 __m128 __b) {
2642 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2643}
2644
2645/// Given two 128-bit floating-point vectors of [4 x float], perform an
2646/// element-by-element comparison of the single-precision element in the
2647/// first source vector and the corresponding element in the second source
2648/// vector.
2649///
2650/// The EFLAGS register is updated as follows: \n
2651/// If there is at least one pair of single-precision elements where the
2652/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2653/// ZF flag is set to 1. \n
2654/// If there is at least one pair of single-precision elements where the
2655/// sign-bit of the first element is 0 and the sign-bit of the second element
2656/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2657/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2658/// otherwise it returns 0.
2659///
2660/// \headerfile <x86intrin.h>
2661///
2662/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2663///
2664/// \param __a
2665/// A 128-bit vector of [4 x float].
2666/// \param __b
2667/// A 128-bit vector of [4 x float].
2668/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2670 __m128 __b) {
2671 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2672}
2673
2674/// Given two 256-bit floating-point vectors of [4 x double], perform an
2675/// element-by-element comparison of the double-precision elements in the
2676/// first source vector and the corresponding elements in the second source
2677/// vector.
2678///
2679/// The EFLAGS register is updated as follows: \n
2680/// If there is at least one pair of double-precision elements where the
2681/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2682/// ZF flag is set to 1. \n
2683/// If there is at least one pair of double-precision elements where the
2684/// sign-bit of the first element is 0 and the sign-bit of the second element
2685/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2686/// This intrinsic returns the value of the ZF flag.
2687///
2688/// \headerfile <x86intrin.h>
2689///
2690/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2691///
2692/// \param __a
2693/// A 256-bit vector of [4 x double].
2694/// \param __b
2695/// A 256-bit vector of [4 x double].
2696/// \returns the ZF flag.
2698 __m256d __b) {
2699 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2700}
2701
2702/// Given two 256-bit floating-point vectors of [4 x double], perform an
2703/// element-by-element comparison of the double-precision elements in the
2704/// first source vector and the corresponding elements in the second source
2705/// vector.
2706///
2707/// The EFLAGS register is updated as follows: \n
2708/// If there is at least one pair of double-precision elements where the
2709/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2710/// ZF flag is set to 1. \n
2711/// If there is at least one pair of double-precision elements where the
2712/// sign-bit of the first element is 0 and the sign-bit of the second element
2713/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2714/// This intrinsic returns the value of the CF flag.
2715///
2716/// \headerfile <x86intrin.h>
2717///
2718/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2719///
2720/// \param __a
2721/// A 256-bit vector of [4 x double].
2722/// \param __b
2723/// A 256-bit vector of [4 x double].
2724/// \returns the CF flag.
2726 __m256d __b) {
2727 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2728}
2729
2730/// Given two 256-bit floating-point vectors of [4 x double], perform an
2731/// element-by-element comparison of the double-precision elements in the
2732/// first source vector and the corresponding elements in the second source
2733/// vector.
2734///
2735/// The EFLAGS register is updated as follows: \n
2736/// If there is at least one pair of double-precision elements where the
2737/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2738/// ZF flag is set to 1. \n
2739/// If there is at least one pair of double-precision elements where the
2740/// sign-bit of the first element is 0 and the sign-bit of the second element
2741/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2742/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2743/// otherwise it returns 0.
2744///
2745/// \headerfile <x86intrin.h>
2746///
2747/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2748///
2749/// \param __a
2750/// A 256-bit vector of [4 x double].
2751/// \param __b
2752/// A 256-bit vector of [4 x double].
2753/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2754static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2755_mm256_testnzc_pd(__m256d __a, __m256d __b) {
2756 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2757}
2758
2759/// Given two 256-bit floating-point vectors of [8 x float], perform an
2760/// element-by-element comparison of the single-precision element in the
2761/// first source vector and the corresponding element in the second source
2762/// vector.
2763///
2764/// The EFLAGS register is updated as follows: \n
2765/// If there is at least one pair of single-precision elements where the
2766/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2767/// ZF flag is set to 1. \n
2768/// If there is at least one pair of single-precision elements where the
2769/// sign-bit of the first element is 0 and the sign-bit of the second element
2770/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2771/// This intrinsic returns the value of the ZF flag.
2772///
2773/// \headerfile <x86intrin.h>
2774///
2775/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2776///
2777/// \param __a
2778/// A 256-bit vector of [8 x float].
2779/// \param __b
2780/// A 256-bit vector of [8 x float].
2781/// \returns the ZF flag.
2783 __m256 __b) {
2784 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2785}
2786
2787/// Given two 256-bit floating-point vectors of [8 x float], perform an
2788/// element-by-element comparison of the single-precision element in the
2789/// first source vector and the corresponding element in the second source
2790/// vector.
2791///
2792/// The EFLAGS register is updated as follows: \n
2793/// If there is at least one pair of single-precision elements where the
2794/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2795/// ZF flag is set to 1. \n
2796/// If there is at least one pair of single-precision elements where the
2797/// sign-bit of the first element is 0 and the sign-bit of the second element
2798/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2799/// This intrinsic returns the value of the CF flag.
2800///
2801/// \headerfile <x86intrin.h>
2802///
2803/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2804///
2805/// \param __a
2806/// A 256-bit vector of [8 x float].
2807/// \param __b
2808/// A 256-bit vector of [8 x float].
2809/// \returns the CF flag.
2811 __m256 __b) {
2812 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2813}
2814
2815/// Given two 256-bit floating-point vectors of [8 x float], perform an
2816/// element-by-element comparison of the single-precision elements in the
2817/// first source vector and the corresponding elements in the second source
2818/// vector.
2819///
2820/// The EFLAGS register is updated as follows: \n
2821/// If there is at least one pair of single-precision elements where the
2822/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2823/// ZF flag is set to 1. \n
2824/// If there is at least one pair of single-precision elements where the
2825/// sign-bit of the first element is 0 and the sign-bit of the second element
2826/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2827/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2828/// otherwise it returns 0.
2829///
2830/// \headerfile <x86intrin.h>
2831///
2832/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2833///
2834/// \param __a
2835/// A 256-bit vector of [8 x float].
2836/// \param __b
2837/// A 256-bit vector of [8 x float].
2838/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2840 __m256 __b) {
2841 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2842}
2843
2844/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2845/// of the two source vectors.
2846///
2847/// The EFLAGS register is updated as follows: \n
2848/// If there is at least one pair of bits where both bits are 1, the ZF flag
2849/// is set to 0. Otherwise the ZF flag is set to 1. \n
2850/// If there is at least one pair of bits where the bit from the first source
2851/// vector is 0 and the bit from the second source vector is 1, the CF flag
2852/// is set to 0. Otherwise the CF flag is set to 1. \n
2853/// This intrinsic returns the value of the ZF flag.
2854///
2855/// \headerfile <x86intrin.h>
2856///
2857/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2858///
2859/// \param __a
2860/// A 256-bit integer vector.
2861/// \param __b
2862/// A 256-bit integer vector.
2863/// \returns the ZF flag.
2864static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2865_mm256_testz_si256(__m256i __a, __m256i __b) {
2866 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2867}
2868
2869/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2870/// of the two source vectors.
2871///
2872/// The EFLAGS register is updated as follows: \n
2873/// If there is at least one pair of bits where both bits are 1, the ZF flag
2874/// is set to 0. Otherwise the ZF flag is set to 1. \n
2875/// If there is at least one pair of bits where the bit from the first source
2876/// vector is 0 and the bit from the second source vector is 1, the CF flag
2877/// is set to 0. Otherwise the CF flag is set to 1. \n
2878/// This intrinsic returns the value of the CF flag.
2879///
2880/// \headerfile <x86intrin.h>
2881///
2882/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2883///
2884/// \param __a
2885/// A 256-bit integer vector.
2886/// \param __b
2887/// A 256-bit integer vector.
2888/// \returns the CF flag.
2889static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2890_mm256_testc_si256(__m256i __a, __m256i __b) {
2891 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2892}
2893
2894/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2895/// of the two source vectors.
2896///
2897/// The EFLAGS register is updated as follows: \n
2898/// If there is at least one pair of bits where both bits are 1, the ZF flag
2899/// is set to 0. Otherwise the ZF flag is set to 1. \n
2900/// If there is at least one pair of bits where the bit from the first source
2901/// vector is 0 and the bit from the second source vector is 1, the CF flag
2902/// is set to 0. Otherwise the CF flag is set to 1. \n
2903/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2904/// otherwise it returns 0.
2905///
2906/// \headerfile <x86intrin.h>
2907///
2908/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2909///
2910/// \param __a
2911/// A 256-bit integer vector.
2912/// \param __b
2913/// A 256-bit integer vector.
2914/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2915static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2916_mm256_testnzc_si256(__m256i __a, __m256i __b) {
2917 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2918}
2919
2920/* Vector extract sign mask */
2921/// Extracts the sign bits of double-precision floating point elements
2922/// in a 256-bit vector of [4 x double] and writes them to the lower order
2923/// bits of the return value.
2924///
2925/// \headerfile <x86intrin.h>
2926///
2927/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2928///
2929/// \param __a
2930/// A 256-bit vector of [4 x double] containing the double-precision
2931/// floating point values with sign bits to be extracted.
2932/// \returns The sign bits from the operand, written to bits [3:0].
2933static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2935 return __builtin_ia32_movmskpd256((__v4df)__a);
2936}
2937
2938/// Extracts the sign bits of single-precision floating point elements
2939/// in a 256-bit vector of [8 x float] and writes them to the lower order
2940/// bits of the return value.
2941///
2942/// \headerfile <x86intrin.h>
2943///
2944/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2945///
2946/// \param __a
2947/// A 256-bit vector of [8 x float] containing the single-precision floating
2948/// point values with sign bits to be extracted.
2949/// \returns The sign bits from the operand, written to bits [7:0].
2950static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
2952 return __builtin_ia32_movmskps256((__v8sf)__a);
2953}
2954
/* Vector zero */
2956/// Zeroes the contents of all XMM or YMM registers.
2957///
2958/// \headerfile <x86intrin.h>
2959///
2960/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void) {
  /* Only the "avx" target feature is requested in the attribute list above;
     no minimum vector width is specified. */
  __builtin_ia32_vzeroall();
}
2966
2967/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2968///
2969/// \headerfile <x86intrin.h>
2970///
2971/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void) {
  /* Only the "avx" target feature is requested in the attribute list above;
     no minimum vector width is specified. */
  __builtin_ia32_vzeroupper();
}
2977
2978/* Vector load with broadcast */
2979/// Loads a scalar single-precision floating point value from the
2980/// specified address pointed to by \a __a and broadcasts it to the elements
2981/// of a [4 x float] vector.
2982///
2983/// \headerfile <x86intrin.h>
2984///
2985/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2986///
2987/// \param __a
///    A pointer to the single-precision floating point value to be broadcast.
2989/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2990/// equal to the broadcast value.
2991static __inline __m128 __DEFAULT_FN_ATTRS128
2993{
2994 struct __mm_broadcast_ss_struct {
2995 float __f;
2996 } __attribute__((__packed__, __may_alias__));
2997 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
2998 return __extension__ (__m128){ __f, __f, __f, __f };
2999}
3000
3001/// Loads a scalar double-precision floating point value from the
3002/// specified address pointed to by \a __a and broadcasts it to the elements
3003/// of a [4 x double] vector.
3004///
3005/// \headerfile <x86intrin.h>
3006///
3007/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3008///
3009/// \param __a
///    A pointer to the double-precision floating point value to be broadcast.
3011/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3012/// equal to the broadcast value.
3013static __inline __m256d __DEFAULT_FN_ATTRS
3015{
3016 struct __mm256_broadcast_sd_struct {
3017 double __d;
3018 } __attribute__((__packed__, __may_alias__));
3019 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3020 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3021}
3022
3023/// Loads a scalar single-precision floating point value from the
3024/// specified address pointed to by \a __a and broadcasts it to the elements
3025/// of a [8 x float] vector.
3026///
3027/// \headerfile <x86intrin.h>
3028///
3029/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3030///
3031/// \param __a
///    A pointer to the single-precision floating point value to be broadcast.
3033/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3034/// equal to the broadcast value.
3035static __inline __m256 __DEFAULT_FN_ATTRS
3037{
3038 struct __mm256_broadcast_ss_struct {
3039 float __f;
3040 } __attribute__((__packed__, __may_alias__));
3041 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3042 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3043}
3044
3045/// Loads the data from a 128-bit vector of [2 x double] from the
3046/// specified address pointed to by \a __a and broadcasts it to 128-bit
3047/// elements in a 256-bit vector of [4 x double].
3048///
3049/// \headerfile <x86intrin.h>
3050///
3051/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3052///
3053/// \param __a
///    A pointer to the 128-bit vector of [2 x double] to be broadcast.
3055/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3056/// equal to the broadcast value.
3057static __inline __m256d __DEFAULT_FN_ATTRS
3059{
3060 __m128d __b = _mm_loadu_pd((const double *)__a);
3061 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3062 0, 1, 0, 1);
3063}
3064
3065/// Loads the data from a 128-bit vector of [4 x float] from the
3066/// specified address pointed to by \a __a and broadcasts it to 128-bit
3067/// elements in a 256-bit vector of [8 x float].
3068///
3069/// \headerfile <x86intrin.h>
3070///
3071/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3072///
3073/// \param __a
3074/// The 128-bit vector of [4 x float] to be broadcast.
3075/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3076/// equal to the broadcast value.
3077static __inline __m256 __DEFAULT_FN_ATTRS
3079{
3080 __m128 __b = _mm_loadu_ps((const float *)__a);
3081 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3082 0, 1, 2, 3, 0, 1, 2, 3);
3083}
3084
3085/* SIMD load ops */
3086/// Loads 4 double-precision floating point values from a 32-byte aligned
3087/// memory location pointed to by \a __p into a vector of [4 x double].
3088///
3089/// \headerfile <x86intrin.h>
3090///
3091/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3092///
3093/// \param __p
3094/// A 32-byte aligned pointer to a memory location containing
3095/// double-precision floating point values.
3096/// \returns A 256-bit vector of [4 x double] containing the moved values.
3097static __inline __m256d __DEFAULT_FN_ATTRS
3098_mm256_load_pd(double const *__p)
3099{
3100 return *(const __m256d *)__p;
3101}
3102
3103/// Loads 8 single-precision floating point values from a 32-byte aligned
3104/// memory location pointed to by \a __p into a vector of [8 x float].
3105///
3106/// \headerfile <x86intrin.h>
3107///
3108/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3109///
3110/// \param __p
3111/// A 32-byte aligned pointer to a memory location containing float values.
3112/// \returns A 256-bit vector of [8 x float] containing the moved values.
3113static __inline __m256 __DEFAULT_FN_ATTRS
3114_mm256_load_ps(float const *__p)
3115{
3116 return *(const __m256 *)__p;
3117}
3118
3119/// Loads 4 double-precision floating point values from an unaligned
3120/// memory location pointed to by \a __p into a vector of [4 x double].
3121///
3122/// \headerfile <x86intrin.h>
3123///
3124/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3125///
3126/// \param __p
3127/// A pointer to a memory location containing double-precision floating
3128/// point values.
3129/// \returns A 256-bit vector of [4 x double] containing the moved values.
3130static __inline __m256d __DEFAULT_FN_ATTRS
3131_mm256_loadu_pd(double const *__p)
3132{
3133 struct __loadu_pd {
3134 __m256d_u __v;
3135 } __attribute__((__packed__, __may_alias__));
3136 return ((const struct __loadu_pd*)__p)->__v;
3137}
3138
3139/// Loads 8 single-precision floating point values from an unaligned
3140/// memory location pointed to by \a __p into a vector of [8 x float].
3141///
3142/// \headerfile <x86intrin.h>
3143///
3144/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3145///
3146/// \param __p
3147/// A pointer to a memory location containing single-precision floating
3148/// point values.
3149/// \returns A 256-bit vector of [8 x float] containing the moved values.
3150static __inline __m256 __DEFAULT_FN_ATTRS
3152{
3153 struct __loadu_ps {
3154 __m256_u __v;
3155 } __attribute__((__packed__, __may_alias__));
3156 return ((const struct __loadu_ps*)__p)->__v;
3157}
3158
3159/// Loads 256 bits of integer data from a 32-byte aligned memory
3160/// location pointed to by \a __p into elements of a 256-bit integer vector.
3161///
3162/// \headerfile <x86intrin.h>
3163///
3164/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3165///
3166/// \param __p
3167/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3168/// values.
3169/// \returns A 256-bit integer vector containing the moved values.
3170static __inline __m256i __DEFAULT_FN_ATTRS
3171_mm256_load_si256(__m256i const *__p)
3172{
3173 return *__p;
3174}
3175
3176/// Loads 256 bits of integer data from an unaligned memory location
3177/// pointed to by \a __p into a 256-bit integer vector.
3178///
3179/// \headerfile <x86intrin.h>
3180///
3181/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3182///
3183/// \param __p
3184/// A pointer to a 256-bit integer vector containing integer values.
3185/// \returns A 256-bit integer vector containing the moved values.
3186static __inline __m256i __DEFAULT_FN_ATTRS
3187_mm256_loadu_si256(__m256i_u const *__p)
3188{
3189 struct __loadu_si256 {
3190 __m256i_u __v;
3191 } __attribute__((__packed__, __may_alias__));
3192 return ((const struct __loadu_si256*)__p)->__v;
3193}
3194
3195/// Loads 256 bits of integer data from an unaligned memory location
3196/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3197/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3198/// line boundary.
3199///
3200/// \headerfile <x86intrin.h>
3201///
3202/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3203///
3204/// \param __p
3205/// A pointer to a 256-bit integer vector containing integer values.
3206/// \returns A 256-bit integer vector containing the moved values.
3207static __inline __m256i __DEFAULT_FN_ATTRS
3208_mm256_lddqu_si256(__m256i_u const *__p)
3209{
3210 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3211}
3212
3213/* SIMD store ops */
3214/// Stores double-precision floating point values from a 256-bit vector
3215/// of [4 x double] to a 32-byte aligned memory location pointed to by
3216/// \a __p.
3217///
3218/// \headerfile <x86intrin.h>
3219///
3220/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3221///
3222/// \param __p
3223/// A 32-byte aligned pointer to a memory location that will receive the
3224/// double-precision floating point values.
3225/// \param __a
3226/// A 256-bit vector of [4 x double] containing the values to be moved.
3227static __inline void __DEFAULT_FN_ATTRS
3228_mm256_store_pd(double *__p, __m256d __a)
3229{
3230 *(__m256d *)__p = __a;
3231}
3232
3233/// Stores single-precision floating point values from a 256-bit vector
3234/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3235///
3236/// \headerfile <x86intrin.h>
3237///
3238/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3239///
3240/// \param __p
3241/// A 32-byte aligned pointer to a memory location that will receive the
3242/// float values.
3243/// \param __a
3244/// A 256-bit vector of [8 x float] containing the values to be moved.
3245static __inline void __DEFAULT_FN_ATTRS
3246_mm256_store_ps(float *__p, __m256 __a)
3247{
3248 *(__m256 *)__p = __a;
3249}
3250
3251/// Stores double-precision floating point values from a 256-bit vector
3252/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3253///
3254/// \headerfile <x86intrin.h>
3255///
3256/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3257///
3258/// \param __p
3259/// A pointer to a memory location that will receive the double-precision
3260/// floating point values.
3261/// \param __a
3262/// A 256-bit vector of [4 x double] containing the values to be moved.
3263static __inline void __DEFAULT_FN_ATTRS
3264_mm256_storeu_pd(double *__p, __m256d __a)
3265{
3266 struct __storeu_pd {
3267 __m256d_u __v;
3268 } __attribute__((__packed__, __may_alias__));
3269 ((struct __storeu_pd*)__p)->__v = __a;
3270}
3271
3272/// Stores single-precision floating point values from a 256-bit vector
3273/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3274///
3275/// \headerfile <x86intrin.h>
3276///
3277/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3278///
3279/// \param __p
3280/// A pointer to a memory location that will receive the float values.
3281/// \param __a
3282/// A 256-bit vector of [8 x float] containing the values to be moved.
3283static __inline void __DEFAULT_FN_ATTRS
3284_mm256_storeu_ps(float *__p, __m256 __a)
3285{
3286 struct __storeu_ps {
3287 __m256_u __v;
3288 } __attribute__((__packed__, __may_alias__));
3289 ((struct __storeu_ps*)__p)->__v = __a;
3290}
3291
3292/// Stores integer values from a 256-bit integer vector to a 32-byte
3293/// aligned memory location pointed to by \a __p.
3294///
3295/// \headerfile <x86intrin.h>
3296///
3297/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3298///
3299/// \param __p
3300/// A 32-byte aligned pointer to a memory location that will receive the
3301/// integer values.
3302/// \param __a
3303/// A 256-bit integer vector containing the values to be moved.
3304static __inline void __DEFAULT_FN_ATTRS
3305_mm256_store_si256(__m256i *__p, __m256i __a)
3306{
3307 *__p = __a;
3308}
3309
3310/// Stores integer values from a 256-bit integer vector to an unaligned
3311/// memory location pointed to by \a __p.
3312///
3313/// \headerfile <x86intrin.h>
3314///
3315/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3316///
3317/// \param __p
3318/// A pointer to a memory location that will receive the integer values.
3319/// \param __a
3320/// A 256-bit integer vector containing the values to be moved.
3321static __inline void __DEFAULT_FN_ATTRS
3322_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3323{
3324 struct __storeu_si256 {
3325 __m256i_u __v;
3326 } __attribute__((__packed__, __may_alias__));
3327 ((struct __storeu_si256*)__p)->__v = __a;
3328}
3329
3330/* Conditional load ops */
3331/// Conditionally loads double-precision floating point elements from a
3332/// memory location pointed to by \a __p into a 128-bit vector of
3333/// [2 x double], depending on the mask bits associated with each data
3334/// element.
3335///
3336/// \headerfile <x86intrin.h>
3337///
3338/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3339///
3340/// \param __p
3341/// A pointer to a memory location that contains the double-precision
3342/// floating point values.
3343/// \param __m
3344/// A 128-bit integer vector containing the mask. The most significant bit of
3345/// each data element represents the mask bits. If a mask bit is zero, the
3346/// corresponding value in the memory location is not loaded and the
3347/// corresponding field in the return value is set to zero.
3348/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3349static __inline __m128d __DEFAULT_FN_ATTRS128
3350_mm_maskload_pd(double const *__p, __m128i __m)
3351{
3352 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3353}
3354
3355/// Conditionally loads double-precision floating point elements from a
3356/// memory location pointed to by \a __p into a 256-bit vector of
3357/// [4 x double], depending on the mask bits associated with each data
3358/// element.
3359///
3360/// \headerfile <x86intrin.h>
3361///
3362/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3363///
3364/// \param __p
3365/// A pointer to a memory location that contains the double-precision
3366/// floating point values.
3367/// \param __m
3368/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3369/// significant bit of each quadword element represents the mask bits. If a
3370/// mask bit is zero, the corresponding value in the memory location is not
3371/// loaded and the corresponding field in the return value is set to zero.
3372/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3373static __inline __m256d __DEFAULT_FN_ATTRS
3374_mm256_maskload_pd(double const *__p, __m256i __m)
3375{
3376 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3377 (__v4di)__m);
3378}
3379
3380/// Conditionally loads single-precision floating point elements from a
3381/// memory location pointed to by \a __p into a 128-bit vector of
3382/// [4 x float], depending on the mask bits associated with each data
3383/// element.
3384///
3385/// \headerfile <x86intrin.h>
3386///
3387/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3388///
3389/// \param __p
3390/// A pointer to a memory location that contains the single-precision
3391/// floating point values.
3392/// \param __m
3393/// A 128-bit integer vector containing the mask. The most significant bit of
3394/// each data element represents the mask bits. If a mask bit is zero, the
3395/// corresponding value in the memory location is not loaded and the
3396/// corresponding field in the return value is set to zero.
3397/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3398static __inline __m128 __DEFAULT_FN_ATTRS128
3399_mm_maskload_ps(float const *__p, __m128i __m)
3400{
3401 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3402}
3403
3404/// Conditionally loads single-precision floating point elements from a
3405/// memory location pointed to by \a __p into a 256-bit vector of
3406/// [8 x float], depending on the mask bits associated with each data
3407/// element.
3408///
3409/// \headerfile <x86intrin.h>
3410///
3411/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3412///
3413/// \param __p
3414/// A pointer to a memory location that contains the single-precision
3415/// floating point values.
3416/// \param __m
3417/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3418/// significant bit of each dword element represents the mask bits. If a mask
3419/// bit is zero, the corresponding value in the memory location is not loaded
3420/// and the corresponding field in the return value is set to zero.
3421/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3422static __inline __m256 __DEFAULT_FN_ATTRS
3423_mm256_maskload_ps(float const *__p, __m256i __m)
3424{
3425 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3426}
3427
3428/* Conditional store ops */
3429/// Moves single-precision floating point values from a 256-bit vector
3430/// of [8 x float] to a memory location pointed to by \a __p, according to
3431/// the specified mask.
3432///
3433/// \headerfile <x86intrin.h>
3434///
3435/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3436///
3437/// \param __p
3438/// A pointer to a memory location that will receive the float values.
3439/// \param __m
3440/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3441/// significant bit of each dword element in the mask vector represents the
3442/// mask bits. If a mask bit is zero, the corresponding value from vector
3443/// \a __a is not stored and the corresponding field in the memory location
3444/// pointed to by \a __p is not changed.
3445/// \param __a
3446/// A 256-bit vector of [8 x float] containing the values to be stored.
3447static __inline void __DEFAULT_FN_ATTRS
3448_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3449{
3450 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3451}
3452
3453/// Moves double-precision values from a 128-bit vector of [2 x double]
3454/// to a memory location pointed to by \a __p, according to the specified
3455/// mask.
3456///
3457/// \headerfile <x86intrin.h>
3458///
3459/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3460///
3461/// \param __p
3462/// A pointer to a memory location that will receive the double values.
3463/// \param __m
3464/// A 128-bit integer vector containing the mask. The most significant bit of
3465/// each field in the mask vector represents the mask bits. If a mask bit is
3466/// zero, the corresponding value from vector \a __a is not stored and the
3467/// corresponding field in the memory location pointed to by \a __p is not
3468/// changed.
3469/// \param __a
3470/// A 128-bit vector of [2 x double] containing the values to be stored.
3471static __inline void __DEFAULT_FN_ATTRS128
3472_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3473{
3474 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3475}
3476
3477/// Moves double-precision values from a 256-bit vector of [4 x double]
3478/// to a memory location pointed to by \a __p, according to the specified
3479/// mask.
3480///
3481/// \headerfile <x86intrin.h>
3482///
3483/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3484///
3485/// \param __p
3486/// A pointer to a memory location that will receive the double values.
3487/// \param __m
3488/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3489/// significant bit of each quadword element in the mask vector represents
3490/// the mask bits. If a mask bit is zero, the corresponding value from vector
3491/// \a __a is not stored and the corresponding field in the memory location
3492/// pointed to by \a __p is not changed.
3493/// \param __a
3494/// A 256-bit vector of [4 x double] containing the values to be stored.
3495static __inline void __DEFAULT_FN_ATTRS
3496_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3497{
3498 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3499}
3500
3501/// Moves single-precision floating point values from a 128-bit vector
3502/// of [4 x float] to a memory location pointed to by \a __p, according to
3503/// the specified mask.
3504///
3505/// \headerfile <x86intrin.h>
3506///
3507/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3508///
3509/// \param __p
3510/// A pointer to a memory location that will receive the float values.
3511/// \param __m
3512/// A 128-bit integer vector containing the mask. The most significant bit of
3513/// each field in the mask vector represents the mask bits. If a mask bit is
3514/// zero, the corresponding value from vector \a __a is not stored and the
3515/// corresponding field in the memory location pointed to by \a __p is not
3516/// changed.
3517/// \param __a
3518/// A 128-bit vector of [4 x float] containing the values to be stored.
3519static __inline void __DEFAULT_FN_ATTRS128
3520_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3521{
3522 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3523}
3524
3525/* Cacheability support ops */
3526/// Moves integer data from a 256-bit integer vector to a 32-byte
3527/// aligned memory location. To minimize caching, the data is flagged as
3528/// non-temporal (unlikely to be used again soon).
3529///
3530/// \headerfile <x86intrin.h>
3531///
3532/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3533///
3534/// \param __a
3535/// A pointer to a 32-byte aligned memory location that will receive the
3536/// integer values.
3537/// \param __b
3538/// A 256-bit integer vector containing the values to be moved.
3539static __inline void __DEFAULT_FN_ATTRS
3541{
3542 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3543 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3544}
3545
3546/// Moves double-precision values from a 256-bit vector of [4 x double]
3547/// to a 32-byte aligned memory location. To minimize caching, the data is
3548/// flagged as non-temporal (unlikely to be used again soon).
3549///
3550/// \headerfile <x86intrin.h>
3551///
3552/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3553///
3554/// \param __a
3555/// A pointer to a 32-byte aligned memory location that will receive the
3556/// double-precision floating-point values.
3557/// \param __b
3558/// A 256-bit vector of [4 x double] containing the values to be moved.
3559static __inline void __DEFAULT_FN_ATTRS
3560_mm256_stream_pd(void *__a, __m256d __b)
3561{
3562 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3563 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3564}
3565
3566/// Moves single-precision floating point values from a 256-bit vector
3567/// of [8 x float] to a 32-byte aligned memory location. To minimize
3568/// caching, the data is flagged as non-temporal (unlikely to be used again
3569/// soon).
3570///
3571/// \headerfile <x86intrin.h>
3572///
3573/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3574///
3575/// \param __p
3576/// A pointer to a 32-byte aligned memory location that will receive the
3577/// single-precision floating point values.
3578/// \param __a
3579/// A 256-bit vector of [8 x float] containing the values to be moved.
3580static __inline void __DEFAULT_FN_ATTRS
3582{
3583 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3584 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3585}
3586
3587/* Create vectors */
3588/// Create a 256-bit vector of [4 x double] with undefined values.
3589///
3590/// \headerfile <x86intrin.h>
3591///
3592/// This intrinsic has no corresponding instruction.
3593///
3594/// \returns A 256-bit vector of [4 x double] containing undefined values.
3595static __inline__ __m256d __DEFAULT_FN_ATTRS
3597{
3598 return (__m256d)__builtin_ia32_undef256();
3599}
3600
3601/// Create a 256-bit vector of [8 x float] with undefined values.
3602///
3603/// \headerfile <x86intrin.h>
3604///
3605/// This intrinsic has no corresponding instruction.
3606///
3607/// \returns A 256-bit vector of [8 x float] containing undefined values.
3608static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void) {
3609 return (__m256)__builtin_ia32_undef256();
3610}
3611
3612/// Create a 256-bit integer vector with undefined values.
3613///
3614/// \headerfile <x86intrin.h>
3615///
3616/// This intrinsic has no corresponding instruction.
3617///
3618/// \returns A 256-bit integer vector containing undefined values.
3619static __inline__ __m256i __DEFAULT_FN_ATTRS
3621{
3622 return (__m256i)__builtin_ia32_undef256();
3623}
3624
3625/// Constructs a 256-bit floating-point vector of [4 x double]
3626/// initialized with the specified double-precision floating-point values.
3627///
3628/// \headerfile <x86intrin.h>
3629///
3630/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3631/// instruction.
3632///
3633/// \param __a
3634/// A double-precision floating-point value used to initialize bits [255:192]
3635/// of the result.
3636/// \param __b
3637/// A double-precision floating-point value used to initialize bits [191:128]
3638/// of the result.
3639/// \param __c
3640/// A double-precision floating-point value used to initialize bits [127:64]
3641/// of the result.
3642/// \param __d
3643/// A double-precision floating-point value used to initialize bits [63:0]
3644/// of the result.
3645/// \returns An initialized 256-bit floating-point vector of [4 x double].
3646static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3647_mm256_set_pd(double __a, double __b, double __c, double __d)
3648{
3649 return __extension__ (__m256d){ __d, __c, __b, __a };
3650}
3651
3652/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3653/// with the specified single-precision floating-point values.
3654///
3655/// \headerfile <x86intrin.h>
3656///
3657/// This intrinsic is a utility function and does not correspond to a specific
3658/// instruction.
3659///
3660/// \param __a
3661/// A single-precision floating-point value used to initialize bits [255:224]
3662/// of the result.
3663/// \param __b
3664/// A single-precision floating-point value used to initialize bits [223:192]
3665/// of the result.
3666/// \param __c
3667/// A single-precision floating-point value used to initialize bits [191:160]
3668/// of the result.
3669/// \param __d
3670/// A single-precision floating-point value used to initialize bits [159:128]
3671/// of the result.
3672/// \param __e
3673/// A single-precision floating-point value used to initialize bits [127:96]
3674/// of the result.
3675/// \param __f
3676/// A single-precision floating-point value used to initialize bits [95:64]
3677/// of the result.
3678/// \param __g
3679/// A single-precision floating-point value used to initialize bits [63:32]
3680/// of the result.
3681/// \param __h
3682/// A single-precision floating-point value used to initialize bits [31:0]
3683/// of the result.
3684/// \returns An initialized 256-bit floating-point vector of [8 x float].
3685static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3686_mm256_set_ps(float __a, float __b, float __c, float __d,
3687 float __e, float __f, float __g, float __h)
3688{
3689 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3690}
3691
3692/// Constructs a 256-bit integer vector initialized with the specified
3693/// 32-bit integral values.
3694///
3695/// \headerfile <x86intrin.h>
3696///
3697/// This intrinsic is a utility function and does not correspond to a specific
3698/// instruction.
3699///
3700/// \param __i0
3701/// A 32-bit integral value used to initialize bits [255:224] of the result.
3702/// \param __i1
3703/// A 32-bit integral value used to initialize bits [223:192] of the result.
3704/// \param __i2
3705/// A 32-bit integral value used to initialize bits [191:160] of the result.
3706/// \param __i3
3707/// A 32-bit integral value used to initialize bits [159:128] of the result.
3708/// \param __i4
3709/// A 32-bit integral value used to initialize bits [127:96] of the result.
3710/// \param __i5
3711/// A 32-bit integral value used to initialize bits [95:64] of the result.
3712/// \param __i6
3713/// A 32-bit integral value used to initialize bits [63:32] of the result.
3714/// \param __i7
3715/// A 32-bit integral value used to initialize bits [31:0] of the result.
3716/// \returns An initialized 256-bit integer vector.
3717static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3718_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3719 int __i4, int __i5, int __i6, int __i7)
3720{
3721 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3722}
3723
3724/// Constructs a 256-bit integer vector initialized with the specified
3725/// 16-bit integral values.
3726///
3727/// \headerfile <x86intrin.h>
3728///
3729/// This intrinsic is a utility function and does not correspond to a specific
3730/// instruction.
3731///
3732/// \param __w15
3733/// A 16-bit integral value used to initialize bits [255:240] of the result.
3734/// \param __w14
3735/// A 16-bit integral value used to initialize bits [239:224] of the result.
3736/// \param __w13
3737/// A 16-bit integral value used to initialize bits [223:208] of the result.
3738/// \param __w12
3739/// A 16-bit integral value used to initialize bits [207:192] of the result.
3740/// \param __w11
3741/// A 16-bit integral value used to initialize bits [191:176] of the result.
3742/// \param __w10
3743/// A 16-bit integral value used to initialize bits [175:160] of the result.
3744/// \param __w09
3745/// A 16-bit integral value used to initialize bits [159:144] of the result.
3746/// \param __w08
3747/// A 16-bit integral value used to initialize bits [143:128] of the result.
3748/// \param __w07
3749/// A 16-bit integral value used to initialize bits [127:112] of the result.
3750/// \param __w06
3751/// A 16-bit integral value used to initialize bits [111:96] of the result.
3752/// \param __w05
3753/// A 16-bit integral value used to initialize bits [95:80] of the result.
3754/// \param __w04
3755/// A 16-bit integral value used to initialize bits [79:64] of the result.
3756/// \param __w03
3757/// A 16-bit integral value used to initialize bits [63:48] of the result.
3758/// \param __w02
3759/// A 16-bit integral value used to initialize bits [47:32] of the result.
3760/// \param __w01
3761/// A 16-bit integral value used to initialize bits [31:16] of the result.
3762/// \param __w00
3763/// A 16-bit integral value used to initialize bits [15:0] of the result.
3764/// \returns An initialized 256-bit integer vector.
3765static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3766_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3767 short __w11, short __w10, short __w09, short __w08,
3768 short __w07, short __w06, short __w05, short __w04,
3769 short __w03, short __w02, short __w01, short __w00)
3770{
3771 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3772 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3773}
3774
3775/// Constructs a 256-bit integer vector initialized with the specified
3776/// 8-bit integral values.
3777///
3778/// \headerfile <x86intrin.h>
3779///
3780/// This intrinsic is a utility function and does not correspond to a specific
3781/// instruction.
3782///
3783/// \param __b31
3784/// An 8-bit integral value used to initialize bits [255:248] of the result.
3785/// \param __b30
3786/// An 8-bit integral value used to initialize bits [247:240] of the result.
3787/// \param __b29
3788/// An 8-bit integral value used to initialize bits [239:232] of the result.
3789/// \param __b28
3790/// An 8-bit integral value used to initialize bits [231:224] of the result.
3791/// \param __b27
3792/// An 8-bit integral value used to initialize bits [223:216] of the result.
3793/// \param __b26
3794/// An 8-bit integral value used to initialize bits [215:208] of the result.
3795/// \param __b25
3796/// An 8-bit integral value used to initialize bits [207:200] of the result.
3797/// \param __b24
3798/// An 8-bit integral value used to initialize bits [199:192] of the result.
3799/// \param __b23
3800/// An 8-bit integral value used to initialize bits [191:184] of the result.
3801/// \param __b22
3802/// An 8-bit integral value used to initialize bits [183:176] of the result.
3803/// \param __b21
3804/// An 8-bit integral value used to initialize bits [175:168] of the result.
3805/// \param __b20
3806/// An 8-bit integral value used to initialize bits [167:160] of the result.
3807/// \param __b19
3808/// An 8-bit integral value used to initialize bits [159:152] of the result.
3809/// \param __b18
3810/// An 8-bit integral value used to initialize bits [151:144] of the result.
3811/// \param __b17
3812/// An 8-bit integral value used to initialize bits [143:136] of the result.
3813/// \param __b16
3814/// An 8-bit integral value used to initialize bits [135:128] of the result.
3815/// \param __b15
3816/// An 8-bit integral value used to initialize bits [127:120] of the result.
3817/// \param __b14
3818/// An 8-bit integral value used to initialize bits [119:112] of the result.
3819/// \param __b13
3820/// An 8-bit integral value used to initialize bits [111:104] of the result.
3821/// \param __b12
3822/// An 8-bit integral value used to initialize bits [103:96] of the result.
3823/// \param __b11
3824/// An 8-bit integral value used to initialize bits [95:88] of the result.
3825/// \param __b10
3826/// An 8-bit integral value used to initialize bits [87:80] of the result.
3827/// \param __b09
3828/// An 8-bit integral value used to initialize bits [79:72] of the result.
3829/// \param __b08
3830/// An 8-bit integral value used to initialize bits [71:64] of the result.
3831/// \param __b07
3832/// An 8-bit integral value used to initialize bits [63:56] of the result.
3833/// \param __b06
3834/// An 8-bit integral value used to initialize bits [55:48] of the result.
3835/// \param __b05
3836/// An 8-bit integral value used to initialize bits [47:40] of the result.
3837/// \param __b04
3838/// An 8-bit integral value used to initialize bits [39:32] of the result.
3839/// \param __b03
3840/// An 8-bit integral value used to initialize bits [31:24] of the result.
3841/// \param __b02
3842/// An 8-bit integral value used to initialize bits [23:16] of the result.
3843/// \param __b01
3844/// An 8-bit integral value used to initialize bits [15:8] of the result.
3845/// \param __b00
3846/// An 8-bit integral value used to initialize bits [7:0] of the result.
3847/// \returns An initialized 256-bit integer vector.
3848static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3849_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3850 char __b27, char __b26, char __b25, char __b24,
3851 char __b23, char __b22, char __b21, char __b20,
3852 char __b19, char __b18, char __b17, char __b16,
3853 char __b15, char __b14, char __b13, char __b12,
3854 char __b11, char __b10, char __b09, char __b08,
3855 char __b07, char __b06, char __b05, char __b04,
3856 char __b03, char __b02, char __b01, char __b00)
3857{
3858 return __extension__ (__m256i)(__v32qi){
3859 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3860 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3861 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3862 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3863 };
3864}
3865
3866/// Constructs a 256-bit integer vector initialized with the specified
3867/// 64-bit integral values.
3868///
3869/// \headerfile <x86intrin.h>
3870///
3871/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3872/// instruction.
3873///
3874/// \param __a
3875/// A 64-bit integral value used to initialize bits [255:192] of the result.
3876/// \param __b
3877/// A 64-bit integral value used to initialize bits [191:128] of the result.
3878/// \param __c
3879/// A 64-bit integral value used to initialize bits [127:64] of the result.
3880/// \param __d
3881/// A 64-bit integral value used to initialize bits [63:0] of the result.
3882/// \returns An initialized 256-bit integer vector.
3883static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3884_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3885{
3886 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3887}
3888
/* Create vectors with elements in reverse order */
3890/// Constructs a 256-bit floating-point vector of [4 x double],
3891/// initialized in reverse order with the specified double-precision
3892/// floating-point values.
3893///
3894/// \headerfile <x86intrin.h>
3895///
3896/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3897/// instruction.
3898///
3899/// \param __a
3900/// A double-precision floating-point value used to initialize bits [63:0]
3901/// of the result.
3902/// \param __b
3903/// A double-precision floating-point value used to initialize bits [127:64]
3904/// of the result.
3905/// \param __c
3906/// A double-precision floating-point value used to initialize bits [191:128]
3907/// of the result.
3908/// \param __d
3909/// A double-precision floating-point value used to initialize bits [255:192]
3910/// of the result.
3911/// \returns An initialized 256-bit floating-point vector of [4 x double].
3912static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3913_mm256_setr_pd(double __a, double __b, double __c, double __d)
3914{
3915 return _mm256_set_pd(__d, __c, __b, __a);
3916}
3917
3918/// Constructs a 256-bit floating-point vector of [8 x float],
3919/// initialized in reverse order with the specified single-precision
3920/// float-point values.
3921///
3922/// \headerfile <x86intrin.h>
3923///
3924/// This intrinsic is a utility function and does not correspond to a specific
3925/// instruction.
3926///
3927/// \param __a
3928/// A single-precision floating-point value used to initialize bits [31:0]
3929/// of the result.
3930/// \param __b
3931/// A single-precision floating-point value used to initialize bits [63:32]
3932/// of the result.
3933/// \param __c
3934/// A single-precision floating-point value used to initialize bits [95:64]
3935/// of the result.
3936/// \param __d
3937/// A single-precision floating-point value used to initialize bits [127:96]
3938/// of the result.
3939/// \param __e
3940/// A single-precision floating-point value used to initialize bits [159:128]
3941/// of the result.
3942/// \param __f
3943/// A single-precision floating-point value used to initialize bits [191:160]
3944/// of the result.
3945/// \param __g
3946/// A single-precision floating-point value used to initialize bits [223:192]
3947/// of the result.
3948/// \param __h
3949/// A single-precision floating-point value used to initialize bits [255:224]
3950/// of the result.
3951/// \returns An initialized 256-bit floating-point vector of [8 x float].
3952static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3953_mm256_setr_ps(float __a, float __b, float __c, float __d,
3954 float __e, float __f, float __g, float __h)
3955{
3956 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3957}
3958
3959/// Constructs a 256-bit integer vector, initialized in reverse order
3960/// with the specified 32-bit integral values.
3961///
3962/// \headerfile <x86intrin.h>
3963///
3964/// This intrinsic is a utility function and does not correspond to a specific
3965/// instruction.
3966///
3967/// \param __i0
3968/// A 32-bit integral value used to initialize bits [31:0] of the result.
3969/// \param __i1
3970/// A 32-bit integral value used to initialize bits [63:32] of the result.
3971/// \param __i2
3972/// A 32-bit integral value used to initialize bits [95:64] of the result.
3973/// \param __i3
3974/// A 32-bit integral value used to initialize bits [127:96] of the result.
3975/// \param __i4
3976/// A 32-bit integral value used to initialize bits [159:128] of the result.
3977/// \param __i5
3978/// A 32-bit integral value used to initialize bits [191:160] of the result.
3979/// \param __i6
3980/// A 32-bit integral value used to initialize bits [223:192] of the result.
3981/// \param __i7
3982/// A 32-bit integral value used to initialize bits [255:224] of the result.
3983/// \returns An initialized 256-bit integer vector.
3984static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
3985_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3986 int __i4, int __i5, int __i6, int __i7)
3987{
3988 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
3989}
3990
3991/// Constructs a 256-bit integer vector, initialized in reverse order
3992/// with the specified 16-bit integral values.
3993///
3994/// \headerfile <x86intrin.h>
3995///
3996/// This intrinsic is a utility function and does not correspond to a specific
3997/// instruction.
3998///
3999/// \param __w15
4000/// A 16-bit integral value used to initialize bits [15:0] of the result.
4001/// \param __w14
4002/// A 16-bit integral value used to initialize bits [31:16] of the result.
4003/// \param __w13
4004/// A 16-bit integral value used to initialize bits [47:32] of the result.
4005/// \param __w12
4006/// A 16-bit integral value used to initialize bits [63:48] of the result.
4007/// \param __w11
4008/// A 16-bit integral value used to initialize bits [79:64] of the result.
4009/// \param __w10
4010/// A 16-bit integral value used to initialize bits [95:80] of the result.
4011/// \param __w09
4012/// A 16-bit integral value used to initialize bits [111:96] of the result.
4013/// \param __w08
4014/// A 16-bit integral value used to initialize bits [127:112] of the result.
4015/// \param __w07
4016/// A 16-bit integral value used to initialize bits [143:128] of the result.
4017/// \param __w06
4018/// A 16-bit integral value used to initialize bits [159:144] of the result.
4019/// \param __w05
4020/// A 16-bit integral value used to initialize bits [175:160] of the result.
4021/// \param __w04
4022/// A 16-bit integral value used to initialize bits [191:176] of the result.
4023/// \param __w03
4024/// A 16-bit integral value used to initialize bits [207:192] of the result.
4025/// \param __w02
4026/// A 16-bit integral value used to initialize bits [223:208] of the result.
4027/// \param __w01
4028/// A 16-bit integral value used to initialize bits [239:224] of the result.
4029/// \param __w00
4030/// A 16-bit integral value used to initialize bits [255:240] of the result.
4031/// \returns An initialized 256-bit integer vector.
4032static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4033_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4034 short __w11, short __w10, short __w09, short __w08,
4035 short __w07, short __w06, short __w05, short __w04,
4036 short __w03, short __w02, short __w01, short __w00)
4037{
4038 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4039 __w04, __w05, __w06, __w07,
4040 __w08, __w09, __w10, __w11,
4041 __w12, __w13, __w14, __w15);
4042}
4043
4044/// Constructs a 256-bit integer vector, initialized in reverse order
4045/// with the specified 8-bit integral values.
4046///
4047/// \headerfile <x86intrin.h>
4048///
4049/// This intrinsic is a utility function and does not correspond to a specific
4050/// instruction.
4051///
4052/// \param __b31
4053/// An 8-bit integral value used to initialize bits [7:0] of the result.
4054/// \param __b30
4055/// An 8-bit integral value used to initialize bits [15:8] of the result.
4056/// \param __b29
4057/// An 8-bit integral value used to initialize bits [23:16] of the result.
4058/// \param __b28
4059/// An 8-bit integral value used to initialize bits [31:24] of the result.
4060/// \param __b27
4061/// An 8-bit integral value used to initialize bits [39:32] of the result.
4062/// \param __b26
4063/// An 8-bit integral value used to initialize bits [47:40] of the result.
4064/// \param __b25
4065/// An 8-bit integral value used to initialize bits [55:48] of the result.
4066/// \param __b24
4067/// An 8-bit integral value used to initialize bits [63:56] of the result.
4068/// \param __b23
4069/// An 8-bit integral value used to initialize bits [71:64] of the result.
4070/// \param __b22
4071/// An 8-bit integral value used to initialize bits [79:72] of the result.
4072/// \param __b21
4073/// An 8-bit integral value used to initialize bits [87:80] of the result.
4074/// \param __b20
4075/// An 8-bit integral value used to initialize bits [95:88] of the result.
4076/// \param __b19
4077/// An 8-bit integral value used to initialize bits [103:96] of the result.
4078/// \param __b18
4079/// An 8-bit integral value used to initialize bits [111:104] of the result.
4080/// \param __b17
4081/// An 8-bit integral value used to initialize bits [119:112] of the result.
4082/// \param __b16
4083/// An 8-bit integral value used to initialize bits [127:120] of the result.
4084/// \param __b15
4085/// An 8-bit integral value used to initialize bits [135:128] of the result.
4086/// \param __b14
4087/// An 8-bit integral value used to initialize bits [143:136] of the result.
4088/// \param __b13
4089/// An 8-bit integral value used to initialize bits [151:144] of the result.
4090/// \param __b12
4091/// An 8-bit integral value used to initialize bits [159:152] of the result.
4092/// \param __b11
4093/// An 8-bit integral value used to initialize bits [167:160] of the result.
4094/// \param __b10
4095/// An 8-bit integral value used to initialize bits [175:168] of the result.
4096/// \param __b09
4097/// An 8-bit integral value used to initialize bits [183:176] of the result.
4098/// \param __b08
4099/// An 8-bit integral value used to initialize bits [191:184] of the result.
4100/// \param __b07
4101/// An 8-bit integral value used to initialize bits [199:192] of the result.
4102/// \param __b06
4103/// An 8-bit integral value used to initialize bits [207:200] of the result.
4104/// \param __b05
4105/// An 8-bit integral value used to initialize bits [215:208] of the result.
4106/// \param __b04
4107/// An 8-bit integral value used to initialize bits [223:216] of the result.
4108/// \param __b03
4109/// An 8-bit integral value used to initialize bits [231:224] of the result.
4110/// \param __b02
4111/// An 8-bit integral value used to initialize bits [239:232] of the result.
4112/// \param __b01
4113/// An 8-bit integral value used to initialize bits [247:240] of the result.
4114/// \param __b00
4115/// An 8-bit integral value used to initialize bits [255:248] of the result.
4116/// \returns An initialized 256-bit integer vector.
4117static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4118_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4119 char __b27, char __b26, char __b25, char __b24,
4120 char __b23, char __b22, char __b21, char __b20,
4121 char __b19, char __b18, char __b17, char __b16,
4122 char __b15, char __b14, char __b13, char __b12,
4123 char __b11, char __b10, char __b09, char __b08,
4124 char __b07, char __b06, char __b05, char __b04,
4125 char __b03, char __b02, char __b01, char __b00)
4126{
4127 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4128 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4129 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4130 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4131}
4132
4133/// Constructs a 256-bit integer vector, initialized in reverse order
4134/// with the specified 64-bit integral values.
4135///
4136/// \headerfile <x86intrin.h>
4137///
4138/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4139/// instruction.
4140///
4141/// \param __a
4142/// A 64-bit integral value used to initialize bits [63:0] of the result.
4143/// \param __b
4144/// A 64-bit integral value used to initialize bits [127:64] of the result.
4145/// \param __c
4146/// A 64-bit integral value used to initialize bits [191:128] of the result.
4147/// \param __d
4148/// A 64-bit integral value used to initialize bits [255:192] of the result.
4149/// \returns An initialized 256-bit integer vector.
4150static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4151_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4152{
4153 return _mm256_set_epi64x(__d, __c, __b, __a);
4154}
4155
/* Create vectors with repeated elements */
4157/// Constructs a 256-bit floating-point vector of [4 x double], with each
4158/// of the four double-precision floating-point vector elements set to the
4159/// specified double-precision floating-point value.
4160///
4161/// \headerfile <x86intrin.h>
4162///
4163/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4164///
4165/// \param __w
4166/// A double-precision floating-point value used to initialize each vector
4167/// element of the result.
4168/// \returns An initialized 256-bit floating-point vector of [4 x double].
4169static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4171{
4172 return _mm256_set_pd(__w, __w, __w, __w);
4173}
4174
4175/// Constructs a 256-bit floating-point vector of [8 x float], with each
4176/// of the eight single-precision floating-point vector elements set to the
4177/// specified single-precision floating-point value.
4178///
4179/// \headerfile <x86intrin.h>
4180///
4181/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4182/// instruction.
4183///
4184/// \param __w
4185/// A single-precision floating-point value used to initialize each vector
4186/// element of the result.
4187/// \returns An initialized 256-bit floating-point vector of [8 x float].
4188static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4190{
4191 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4192}
4193
4194/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4195/// 32-bit integral vector elements set to the specified 32-bit integral
4196/// value.
4197///
4198/// \headerfile <x86intrin.h>
4199///
4200/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4201/// instruction.
4202///
4203/// \param __i
4204/// A 32-bit integral value used to initialize each vector element of the
4205/// result.
4206/// \returns An initialized 256-bit integer vector of [8 x i32].
4207static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4209{
4210 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4211}
4212
4213/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4214/// 16-bit integral vector elements set to the specified 16-bit integral
4215/// value.
4216///
4217/// \headerfile <x86intrin.h>
4218///
4219/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4220///
4221/// \param __w
4222/// A 16-bit integral value used to initialize each vector element of the
4223/// result.
4224/// \returns An initialized 256-bit integer vector of [16 x i16].
4225static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4227{
4228 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4229 __w, __w, __w, __w, __w, __w, __w, __w);
4230}
4231
4232/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4233/// 8-bit integral vector elements set to the specified 8-bit integral value.
4234///
4235/// \headerfile <x86intrin.h>
4236///
4237/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4238///
4239/// \param __b
4240/// An 8-bit integral value used to initialize each vector element of the
4241/// result.
4242/// \returns An initialized 256-bit integer vector of [32 x i8].
4243static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4245{
4246 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4247 __b, __b, __b, __b, __b, __b, __b, __b,
4248 __b, __b, __b, __b, __b, __b, __b, __b,
4249 __b, __b, __b, __b, __b, __b, __b, __b);
4250}
4251
4252/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4253/// 64-bit integral vector elements set to the specified 64-bit integral
4254/// value.
4255///
4256/// \headerfile <x86intrin.h>
4257///
4258/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4259///
4260/// \param __q
4261/// A 64-bit integral value used to initialize each vector element of the
4262/// result.
4263/// \returns An initialized 256-bit integer vector of [4 x i64].
4264static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4266{
4267 return _mm256_set_epi64x(__q, __q, __q, __q);
4268}
4269
/* Create zeroed vectors */
4271/// Constructs a 256-bit floating-point vector of [4 x double] with all
4272/// vector elements initialized to zero.
4273///
4274/// \headerfile <x86intrin.h>
4275///
4276/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4277///
4278/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4280 return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
4281}
4282
4283/// Constructs a 256-bit floating-point vector of [8 x float] with all
4284/// vector elements initialized to zero.
4285///
4286/// \headerfile <x86intrin.h>
4287///
4288/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4289///
4290/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4292 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4293}
4294
4295/// Constructs a 256-bit integer vector initialized to zero.
4296///
4297/// \headerfile <x86intrin.h>
4298///
4299/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4300///
4301/// \returns A 256-bit integer vector initialized to zero.
4302static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4304 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4305}
4306
/* Cast between vector types */
4308/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4309/// floating-point vector of [8 x float].
4310///
4311/// \headerfile <x86intrin.h>
4312///
4313/// This intrinsic has no corresponding instruction.
4314///
4315/// \param __a
4316/// A 256-bit floating-point vector of [4 x double].
4317/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4318/// bitwise pattern as the parameter.
4319static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4321{
4322 return (__m256)__a;
4323}
4324
4325/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4326/// integer vector.
4327///
4328/// \headerfile <x86intrin.h>
4329///
4330/// This intrinsic has no corresponding instruction.
4331///
4332/// \param __a
4333/// A 256-bit floating-point vector of [4 x double].
4334/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4335/// parameter.
4336static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4338{
4339 return (__m256i)__a;
4340}
4341
4342/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4343/// floating-point vector of [4 x double].
4344///
4345/// \headerfile <x86intrin.h>
4346///
4347/// This intrinsic has no corresponding instruction.
4348///
4349/// \param __a
4350/// A 256-bit floating-point vector of [8 x float].
4351/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4352/// bitwise pattern as the parameter.
4353static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4355{
4356 return (__m256d)__a;
4357}
4358
4359/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4360/// integer vector.
4361///
4362/// \headerfile <x86intrin.h>
4363///
4364/// This intrinsic has no corresponding instruction.
4365///
4366/// \param __a
4367/// A 256-bit floating-point vector of [8 x float].
4368/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4369/// parameter.
4370static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4372{
4373 return (__m256i)__a;
4374}
4375
4376/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4377/// of [8 x float].
4378///
4379/// \headerfile <x86intrin.h>
4380///
4381/// This intrinsic has no corresponding instruction.
4382///
4383/// \param __a
4384/// A 256-bit integer vector.
4385/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4386/// bitwise pattern as the parameter.
4387static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4389{
4390 return (__m256)__a;
4391}
4392
4393/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4394/// of [4 x double].
4395///
4396/// \headerfile <x86intrin.h>
4397///
4398/// This intrinsic has no corresponding instruction.
4399///
4400/// \param __a
4401/// A 256-bit integer vector.
4402/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4403/// bitwise pattern as the parameter.
4404static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4406{
4407 return (__m256d)__a;
4408}
4409
4410/// Returns the lower 128 bits of a 256-bit floating-point vector of
4411/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4412///
4413/// \headerfile <x86intrin.h>
4414///
4415/// This intrinsic has no corresponding instruction.
4416///
4417/// \param __a
4418/// A 256-bit floating-point vector of [4 x double].
4419/// \returns A 128-bit floating-point vector of [2 x double] containing the
4420/// lower 128 bits of the parameter.
4421static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4423{
4424 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4425}
4426
4427/// Returns the lower 128 bits of a 256-bit floating-point vector of
4428/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4429///
4430/// \headerfile <x86intrin.h>
4431///
4432/// This intrinsic has no corresponding instruction.
4433///
4434/// \param __a
4435/// A 256-bit floating-point vector of [8 x float].
4436/// \returns A 128-bit floating-point vector of [4 x float] containing the
4437/// lower 128 bits of the parameter.
4438static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4440{
4441 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4442}
4443
4444/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4445///
4446/// \headerfile <x86intrin.h>
4447///
4448/// This intrinsic has no corresponding instruction.
4449///
4450/// \param __a
4451/// A 256-bit integer vector.
4452/// \returns A 128-bit integer vector containing the lower 128 bits of the
4453/// parameter.
4454static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4456{
4457 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4458}
4459
4460/// Constructs a 256-bit floating-point vector of [4 x double] from a
4461/// 128-bit floating-point vector of [2 x double].
4462///
4463/// The lower 128 bits contain the value of the source vector. The contents
4464/// of the upper 128 bits are undefined.
4465///
4466/// \headerfile <x86intrin.h>
4467///
4468/// This intrinsic has no corresponding instruction.
4469///
4470/// \param __a
4471/// A 128-bit vector of [2 x double].
4472/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4473/// contain the value of the parameter. The contents of the upper 128 bits
4474/// are undefined.
4475static __inline __m256d __DEFAULT_FN_ATTRS
4477{
4478 return __builtin_shufflevector(
4479 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4480}
4481
4482/// Constructs a 256-bit floating-point vector of [8 x float] from a
4483/// 128-bit floating-point vector of [4 x float].
4484///
4485/// The lower 128 bits contain the value of the source vector. The contents
4486/// of the upper 128 bits are undefined.
4487///
4488/// \headerfile <x86intrin.h>
4489///
4490/// This intrinsic has no corresponding instruction.
4491///
4492/// \param __a
4493/// A 128-bit vector of [4 x float].
4494/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4495/// contain the value of the parameter. The contents of the upper 128 bits
4496/// are undefined.
4497static __inline __m256 __DEFAULT_FN_ATTRS
4499{
4500 return __builtin_shufflevector((__v4sf)__a,
4501 (__v4sf)__builtin_nondeterministic_value(__a),
4502 0, 1, 2, 3, 4, 5, 6, 7);
4503}
4504
4505/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4506///
4507/// The lower 128 bits contain the value of the source vector. The contents
4508/// of the upper 128 bits are undefined.
4509///
4510/// \headerfile <x86intrin.h>
4511///
4512/// This intrinsic has no corresponding instruction.
4513///
4514/// \param __a
4515/// A 128-bit integer vector.
4516/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4517/// the parameter. The contents of the upper 128 bits are undefined.
4518static __inline __m256i __DEFAULT_FN_ATTRS
4520{
4521 return __builtin_shufflevector(
4522 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4523}
4524
4525/// Constructs a 256-bit floating-point vector of [4 x double] from a
4526/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4527/// contain the value of the source vector. The upper 128 bits are set
4528/// to zero.
4529///
4530/// \headerfile <x86intrin.h>
4531///
4532/// This intrinsic has no corresponding instruction.
4533///
4534/// \param __a
4535/// A 128-bit vector of [2 x double].
4536/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4537/// contain the value of the parameter. The upper 128 bits are set to zero.
4538static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4540 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4541}
4542
4543/// Constructs a 256-bit floating-point vector of [8 x float] from a
4544/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4545/// the value of the source vector. The upper 128 bits are set to zero.
4546///
4547/// \headerfile <x86intrin.h>
4548///
4549/// This intrinsic has no corresponding instruction.
4550///
4551/// \param __a
4552/// A 128-bit vector of [4 x float].
4553/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4554/// contain the value of the parameter. The upper 128 bits are set to zero.
4555static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4557 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4558}
4559
4560/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4561/// The lower 128 bits contain the value of the source vector. The upper
4562/// 128 bits are set to zero.
4563///
4564/// \headerfile <x86intrin.h>
4565///
4566/// This intrinsic has no corresponding instruction.
4567///
4568/// \param __a
4569/// A 128-bit integer vector.
4570/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4571/// the parameter. The upper 128 bits are set to zero.
4572static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4574 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4575}
4576
/*
   Vector insert.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// Constructs a new 256-bit vector of [8 x float] by first duplicating
///    a 256-bit vector of [8 x float] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [4 x float] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4619
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
4657
/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
4695
/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
4724
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
4748
/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
4772
4773/// Constructs a 256-bit floating-point vector of [8 x float] by
4774/// concatenating two 128-bit floating-point vectors of [4 x float].
4775///
4776/// \headerfile <x86intrin.h>
4777///
4778/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4779///
4780/// \param __hi
4781/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4782/// 128 bits of the result.
4783/// \param __lo
4784/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4785/// 128 bits of the result.
4786/// \returns A 256-bit floating-point vector of [8 x float] containing the
4787/// concatenated result.
4788static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4789_mm256_set_m128(__m128 __hi, __m128 __lo) {
4790 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4791}
4792
4793/// Constructs a 256-bit floating-point vector of [4 x double] by
4794/// concatenating two 128-bit floating-point vectors of [2 x double].
4795///
4796/// \headerfile <x86intrin.h>
4797///
4798/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4799///
4800/// \param __hi
4801/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4802/// 128 bits of the result.
4803/// \param __lo
4804/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4805/// 128 bits of the result.
4806/// \returns A 256-bit floating-point vector of [4 x double] containing the
4807/// concatenated result.
4808static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4809_mm256_set_m128d(__m128d __hi, __m128d __lo) {
4810 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4811}
4812
4813/// Constructs a 256-bit integer vector by concatenating two 128-bit
4814/// integer vectors.
4815///
4816/// \headerfile <x86intrin.h>
4817///
4818/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4819///
4820/// \param __hi
4821/// A 128-bit integer vector to be copied to the upper 128 bits of the
4822/// result.
4823/// \param __lo
4824/// A 128-bit integer vector to be copied to the lower 128 bits of the
4825/// result.
4826/// \returns A 256-bit integer vector containing the concatenated result.
4827static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4828_mm256_set_m128i(__m128i __hi, __m128i __lo) {
4829 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4830}
4831
4832/// Constructs a 256-bit floating-point vector of [8 x float] by
4833/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4834/// similar to _mm256_set_m128, but the order of the input parameters is
4835/// swapped.
4836///
4837/// \headerfile <x86intrin.h>
4838///
4839/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4840///
4841/// \param __lo
4842/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4843/// 128 bits of the result.
4844/// \param __hi
4845/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4846/// 128 bits of the result.
4847/// \returns A 256-bit floating-point vector of [8 x float] containing the
4848/// concatenated result.
4849static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4850_mm256_setr_m128(__m128 __lo, __m128 __hi) {
4851 return _mm256_set_m128(__hi, __lo);
4852}
4853
4854/// Constructs a 256-bit floating-point vector of [4 x double] by
4855/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4856/// similar to _mm256_set_m128d, but the order of the input parameters is
4857/// swapped.
4858///
4859/// \headerfile <x86intrin.h>
4860///
4861/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4862///
4863/// \param __lo
4864/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4865/// 128 bits of the result.
4866/// \param __hi
4867/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4868/// 128 bits of the result.
4869/// \returns A 256-bit floating-point vector of [4 x double] containing the
4870/// concatenated result.
4871static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4872_mm256_setr_m128d(__m128d __lo, __m128d __hi) {
4873 return (__m256d)_mm256_set_m128d(__hi, __lo);
4874}
4875
4876/// Constructs a 256-bit integer vector by concatenating two 128-bit
4877/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4878/// the input parameters is swapped.
4879///
4880/// \headerfile <x86intrin.h>
4881///
4882/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4883///
4884/// \param __lo
4885/// A 128-bit integer vector to be copied to the lower 128 bits of the
4886/// result.
4887/// \param __hi
4888/// A 128-bit integer vector to be copied to the upper 128 bits of the
4889/// result.
4890/// \returns A 256-bit integer vector containing the concatenated result.
4891static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4892_mm256_setr_m128i(__m128i __lo, __m128i __hi) {
4893 return (__m256i)_mm256_set_m128i(__hi, __lo);
4894}
4895
4896/* SIMD load ops (unaligned) */
4897/// Loads two 128-bit floating-point vectors of [4 x float] from
4898/// unaligned memory locations and constructs a 256-bit floating-point vector
4899/// of [8 x float] by concatenating the two 128-bit vectors.
4900///
4901/// \headerfile <x86intrin.h>
4902///
4903/// This intrinsic corresponds to load instructions followed by the
4904/// <c> VINSERTF128 </c> instruction.
4905///
4906/// \param __addr_hi
4907/// A pointer to a 128-bit memory location containing 4 consecutive
4908/// single-precision floating-point values. These values are to be copied to
4909/// bits[255:128] of the result. The address of the memory location does not
4910/// have to be aligned.
4911/// \param __addr_lo
4912/// A pointer to a 128-bit memory location containing 4 consecutive
4913/// single-precision floating-point values. These values are to be copied to
4914/// bits[127:0] of the result. The address of the memory location does not
4915/// have to be aligned.
4916/// \returns A 256-bit floating-point vector of [8 x float] containing the
4917/// concatenated result.
4918static __inline __m256 __DEFAULT_FN_ATTRS
4919_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4920{
4921 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4922}
4923
4924/// Loads two 128-bit floating-point vectors of [2 x double] from
4925/// unaligned memory locations and constructs a 256-bit floating-point vector
4926/// of [4 x double] by concatenating the two 128-bit vectors.
4927///
4928/// \headerfile <x86intrin.h>
4929///
4930/// This intrinsic corresponds to load instructions followed by the
4931/// <c> VINSERTF128 </c> instruction.
4932///
4933/// \param __addr_hi
4934/// A pointer to a 128-bit memory location containing two consecutive
4935/// double-precision floating-point values. These values are to be copied to
4936/// bits[255:128] of the result. The address of the memory location does not
4937/// have to be aligned.
4938/// \param __addr_lo
4939/// A pointer to a 128-bit memory location containing two consecutive
4940/// double-precision floating-point values. These values are to be copied to
4941/// bits[127:0] of the result. The address of the memory location does not
4942/// have to be aligned.
4943/// \returns A 256-bit floating-point vector of [4 x double] containing the
4944/// concatenated result.
4945static __inline __m256d __DEFAULT_FN_ATTRS
4946_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4947{
4948 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4949}
4950
4951/// Loads two 128-bit integer vectors from unaligned memory locations and
4952/// constructs a 256-bit integer vector by concatenating the two 128-bit
4953/// vectors.
4954///
4955/// \headerfile <x86intrin.h>
4956///
4957/// This intrinsic corresponds to load instructions followed by the
4958/// <c> VINSERTF128 </c> instruction.
4959///
4960/// \param __addr_hi
4961/// A pointer to a 128-bit memory location containing a 128-bit integer
4962/// vector. This vector is to be copied to bits[255:128] of the result. The
4963/// address of the memory location does not have to be aligned.
4964/// \param __addr_lo
4965/// A pointer to a 128-bit memory location containing a 128-bit integer
4966/// vector. This vector is to be copied to bits[127:0] of the result. The
4967/// address of the memory location does not have to be aligned.
4968/// \returns A 256-bit integer vector containing the concatenated result.
4969static __inline __m256i __DEFAULT_FN_ATTRS
4970_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
4971{
4972 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
4973}
4974
4975/* SIMD store ops (unaligned) */
4976/// Stores the upper and lower 128 bits of a 256-bit floating-point
4977/// vector of [8 x float] into two different unaligned memory locations.
4978///
4979/// \headerfile <x86intrin.h>
4980///
4981/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4982/// store instructions.
4983///
4984/// \param __addr_hi
4985/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4986/// copied to this memory location. The address of this memory location does
4987/// not have to be aligned.
4988/// \param __addr_lo
4989/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4990/// copied to this memory location. The address of this memory location does
4991/// not have to be aligned.
4992/// \param __a
4993/// A 256-bit floating-point vector of [8 x float].
4994static __inline void __DEFAULT_FN_ATTRS
4995_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4996{
4997 __m128 __v128;
4998
4999 __v128 = _mm256_castps256_ps128(__a);
5000 _mm_storeu_ps(__addr_lo, __v128);
5001 __v128 = _mm256_extractf128_ps(__a, 1);
5002 _mm_storeu_ps(__addr_hi, __v128);
5003}
5004
5005/// Stores the upper and lower 128 bits of a 256-bit floating-point
5006/// vector of [4 x double] into two different unaligned memory locations.
5007///
5008/// \headerfile <x86intrin.h>
5009///
5010/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5011/// store instructions.
5012///
5013/// \param __addr_hi
5014/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5015/// copied to this memory location. The address of this memory location does
5016/// not have to be aligned.
5017/// \param __addr_lo
5018/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5019/// copied to this memory location. The address of this memory location does
5020/// not have to be aligned.
5021/// \param __a
5022/// A 256-bit floating-point vector of [4 x double].
5023static __inline void __DEFAULT_FN_ATTRS
5024_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5025{
5026 __m128d __v128;
5027
5028 __v128 = _mm256_castpd256_pd128(__a);
5029 _mm_storeu_pd(__addr_lo, __v128);
5030 __v128 = _mm256_extractf128_pd(__a, 1);
5031 _mm_storeu_pd(__addr_hi, __v128);
5032}
5033
5034/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5035/// two different unaligned memory locations.
5036///
5037/// \headerfile <x86intrin.h>
5038///
5039/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5040/// store instructions.
5041///
5042/// \param __addr_hi
5043/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5044/// copied to this memory location. The address of this memory location does
5045/// not have to be aligned.
5046/// \param __addr_lo
5047/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5048/// copied to this memory location. The address of this memory location does
5049/// not have to be aligned.
5050/// \param __a
5051/// A 256-bit integer vector.
5052static __inline void __DEFAULT_FN_ATTRS
5053_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5054{
5055 __m128i __v128;
5056
5057 __v128 = _mm256_castsi256_si128(__a);
5058 _mm_storeu_si128(__addr_lo, __v128);
5059 __v128 = _mm256_extractf128_si256(__a, 1);
5060 _mm_storeu_si128(__addr_hi, __v128);
5061}
5062
/* Remove the function-attribute helper macros used by the definitions above. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
5067
5068#endif /* __AVXINTRIN_H */
__device__ _Float16
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3014
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x floa...
Definition avxintrin.h:169
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition avxintrin.h:3058
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition avxintrin.h:3264
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition avxintrin.h:825
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition avxintrin.h:3560
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition avxintrin.h:2934
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition avxintrin.h:3078
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4539
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition avxintrin.h:2264
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition avxintrin.h:3228
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition avxintrin.h:3284
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition avxintrin.h:2951
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition avxintrin.h:4919
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:351
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3374
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition avxintrin.h:575
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2585
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition avxintrin.h:3718
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4556
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2669
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition avxintrin.h:116
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:384
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition avxintrin.h:2190
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition avxintrin.h:3608
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition avxintrin.h:304
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition avxintrin.h:969
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition avxintrin.h:3953
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4850
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3399
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3350
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition avxintrin.h:4337
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition avxintrin.h:186
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4892
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition avxintrin.h:3322
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition avxintrin.h:4722
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition avxintrin.h:4770
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition avxintrin.h:3171
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition avxintrin.h:4388
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition avxintrin.h:4320
static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition avxintrin.h:2331
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition avxintrin.h:3647
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition avxintrin.h:2356
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4872
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition avxintrin.h:132
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition avxintrin.h:1392
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition avxintrin.h:3540
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition avxintrin.h:3596
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:757
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition avxintrin.h:367
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition avxintrin.h:3766
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition avxintrin.h:2175
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition avxintrin.h:596
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition avxintrin.h:3472
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition avxintrin.h:4476
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition avxintrin.h:4170
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2499
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into four signed truncated (rounded toward zero) 32-bit int...
Definition avxintrin.h:2244
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition avxintrin.h:3620
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition avxintrin.h:2224
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition avxintrin.h:2208
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition avxintrin.h:4189
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition avxintrin.h:2473
static __inline __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition avxintrin.h:787
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition avxintrin.h:286
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [8 x float].
Definition avxintrin.h:3114
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition avxintrin.h:2161
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2865
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2612
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition avxintrin.h:2315
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition avxintrin.h:4746
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into eight signed truncated (rounded toward zero) 32-bit integers returned in a vector of [8 x i32].
Definition avxintrin.h:2284
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition avxintrin.h:4498
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:3036
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition avxintrin.h:244
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2697
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zero.
Definition avxintrin.h:4291
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition avxintrin.h:4208
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition avxintrin.h:3849
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2640
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition avxintrin.h:4946
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition avxintrin.h:82
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2916
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:668
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two diffe...
Definition avxintrin.h:5024
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition avxintrin.h:336
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2755
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2782
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:692
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral values.
Definition avxintrin.h:4151
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements set to the specified 64-bit integral value.
Definition avxintrin.h:4265
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition avxintrin.h:3496
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition avxintrin.h:3448
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral values.
Definition avxintrin.h:4118
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4405
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p into a vector of [4 x double].
Definition avxintrin.h:3131
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-preci...
Definition avxintrin.h:3686
static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-point vector of [2 x double].
Definition avxintrin.h:4422
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4573
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition avxintrin.h:98
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral values.
Definition avxintrin.h:4033
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition avxintrin.h:2381
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition avxintrin.h:2403
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory l...
Definition avxintrin.h:5053
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-point vector of [4 x float].
Definition avxintrin.h:4439
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:614
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition avxintrin.h:4371
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2810
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition avxintrin.h:713
static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition avxintrin.h:879
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition avxintrin.h:650
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition avxintrin.h:632
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x doub...
Definition avxintrin.h:151
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition avxintrin.h:736
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition avxintrin.h:1419
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition avxintrin.h:4970
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2527
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition avxintrin.h:2839
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition avxintrin.h:4279
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition avxintrin.h:3884
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition avxintrin.h:3423
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition avxintrin.h:320
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition avxintrin.h:3581
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition avxintrin.h:554
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3187
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to by __p.
Definition avxintrin.h:3305
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition avxintrin.h:265
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition avxintrin.h:4789
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x double].
Definition avxintrin.h:4354
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4303
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition avxintrin.h:4519
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition avxintrin.h:202
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p into a vector of [8 x float].
Definition avxintrin.h:3151
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition avxintrin.h:2992
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition avxintrin.h:3985
static __inline int __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2555
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition avxintrin.h:2447
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition avxintrin.h:2426
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition avxintrin.h:4226
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set to the specified 8-bit integral value.
Definition avxintrin.h:4244
static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition avxintrin.h:4455
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition avxintrin.h:223
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition avxintrin.h:2725
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition avxintrin.h:3208
static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition avxintrin.h:2890
static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition avxintrin.h:2300
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition avxintrin.h:4828
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition avxintrin.h:3246
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the spe...
Definition avxintrin.h:3913
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two differ...
Definition avxintrin.h:4995
typedef double __v4df __attribute__((__vector_size__(32)))
Definition avxintrin.h:17
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition avxintrin.h:3520
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition avxintrin.h:4809
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [4 x double].
Definition avxintrin.h:3098
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition avxintrin.h:536
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition emmintrin.h:1619
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878
static __inline__ void int __a
Definition emmintrin.h:4077
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1867
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit integer vector.
Definition emmintrin.h:3456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition emmintrin.h:1980
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition emmintrin.h:3909
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2097
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2018
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1860