/* Source listing: clang 18.0.0git, avxintrin.h */
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* Internal element-typed views of a 32-byte (256-bit) vector. The intrinsics
 * in this header cast their arguments to these types before operating on
 * them. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types: 32-byte vectors with unsigned integer elements. The bitwise
 * intrinsics in this header operate on these views. */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char (plain char has
 * implementation-defined signedness). Note that this shouldn't appear in the
 * interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types, 32 bytes wide and 32-byte aligned. */
typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Byte-aligned (alignment 1) counterparts of the 256-bit vector types.
 * NOTE(review): presumably used by the unaligned load/store intrinsics, which
 * are not visible in this excerpt. */
typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));

42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Define the default attributes for the functions in this file: always
 * inlined, no debug info, compiled for the "avx" target. The two variants
 * differ only in the minimum vector width they declare (256 vs. 128 bits). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))

56/* Arithmetic */
57/// Adds two 256-bit vectors of [4 x double].
58///
59/// \headerfile <x86intrin.h>
60///
61/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
62///
63/// \param __a
64/// A 256-bit vector of [4 x double] containing one of the source operands.
65/// \param __b
66/// A 256-bit vector of [4 x double] containing one of the source operands.
67/// \returns A 256-bit vector of [4 x double] containing the sums of both
68/// operands.
69static __inline __m256d __DEFAULT_FN_ATTRS
70_mm256_add_pd(__m256d __a, __m256d __b)
71{
72 return (__m256d)((__v4df)__a+(__v4df)__b);
73}
74
75/// Adds two 256-bit vectors of [8 x float].
76///
77/// \headerfile <x86intrin.h>
78///
79/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
80///
81/// \param __a
82/// A 256-bit vector of [8 x float] containing one of the source operands.
83/// \param __b
84/// A 256-bit vector of [8 x float] containing one of the source operands.
85/// \returns A 256-bit vector of [8 x float] containing the sums of both
86/// operands.
87static __inline __m256 __DEFAULT_FN_ATTRS
88_mm256_add_ps(__m256 __a, __m256 __b)
89{
90 return (__m256)((__v8sf)__a+(__v8sf)__b);
91}
92
93/// Subtracts two 256-bit vectors of [4 x double].
94///
95/// \headerfile <x86intrin.h>
96///
97/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
98///
99/// \param __a
100/// A 256-bit vector of [4 x double] containing the minuend.
101/// \param __b
102/// A 256-bit vector of [4 x double] containing the subtrahend.
103/// \returns A 256-bit vector of [4 x double] containing the differences between
104/// both operands.
105static __inline __m256d __DEFAULT_FN_ATTRS
106_mm256_sub_pd(__m256d __a, __m256d __b)
107{
108 return (__m256d)((__v4df)__a-(__v4df)__b);
109}
110
111/// Subtracts two 256-bit vectors of [8 x float].
112///
113/// \headerfile <x86intrin.h>
114///
115/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
116///
117/// \param __a
118/// A 256-bit vector of [8 x float] containing the minuend.
119/// \param __b
120/// A 256-bit vector of [8 x float] containing the subtrahend.
121/// \returns A 256-bit vector of [8 x float] containing the differences between
122/// both operands.
123static __inline __m256 __DEFAULT_FN_ATTRS
124_mm256_sub_ps(__m256 __a, __m256 __b)
125{
126 return (__m256)((__v8sf)__a-(__v8sf)__b);
127}
128
129/// Adds the even-indexed values and subtracts the odd-indexed values of
130/// two 256-bit vectors of [4 x double].
131///
132/// \headerfile <x86intrin.h>
133///
134/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
135///
136/// \param __a
137/// A 256-bit vector of [4 x double] containing the left source operand.
138/// \param __b
139/// A 256-bit vector of [4 x double] containing the right source operand.
140/// \returns A 256-bit vector of [4 x double] containing the alternating sums
141/// and differences between both operands.
142static __inline __m256d __DEFAULT_FN_ATTRS
143_mm256_addsub_pd(__m256d __a, __m256d __b)
144{
145 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
146}
147
148/// Adds the even-indexed values and subtracts the odd-indexed values of
149/// two 256-bit vectors of [8 x float].
150///
151/// \headerfile <x86intrin.h>
152///
153/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
154///
155/// \param __a
156/// A 256-bit vector of [8 x float] containing the left source operand.
157/// \param __b
158/// A 256-bit vector of [8 x float] containing the right source operand.
159/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
160/// differences between both operands.
161static __inline __m256 __DEFAULT_FN_ATTRS
162_mm256_addsub_ps(__m256 __a, __m256 __b)
163{
164 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
165}
166
167/// Divides two 256-bit vectors of [4 x double].
168///
169/// \headerfile <x86intrin.h>
170///
171/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
172///
173/// \param __a
174/// A 256-bit vector of [4 x double] containing the dividend.
175/// \param __b
176/// A 256-bit vector of [4 x double] containing the divisor.
177/// \returns A 256-bit vector of [4 x double] containing the quotients of both
178/// operands.
179static __inline __m256d __DEFAULT_FN_ATTRS
180_mm256_div_pd(__m256d __a, __m256d __b)
181{
182 return (__m256d)((__v4df)__a/(__v4df)__b);
183}
184
185/// Divides two 256-bit vectors of [8 x float].
186///
187/// \headerfile <x86intrin.h>
188///
189/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
190///
191/// \param __a
192/// A 256-bit vector of [8 x float] containing the dividend.
193/// \param __b
194/// A 256-bit vector of [8 x float] containing the divisor.
195/// \returns A 256-bit vector of [8 x float] containing the quotients of both
196/// operands.
197static __inline __m256 __DEFAULT_FN_ATTRS
198_mm256_div_ps(__m256 __a, __m256 __b)
199{
200 return (__m256)((__v8sf)__a/(__v8sf)__b);
201}
202
203/// Compares two 256-bit vectors of [4 x double] and returns the greater
204/// of each pair of values.
205///
206/// \headerfile <x86intrin.h>
207///
208/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
209///
210/// \param __a
211/// A 256-bit vector of [4 x double] containing one of the operands.
212/// \param __b
213/// A 256-bit vector of [4 x double] containing one of the operands.
214/// \returns A 256-bit vector of [4 x double] containing the maximum values
215/// between both operands.
216static __inline __m256d __DEFAULT_FN_ATTRS
217_mm256_max_pd(__m256d __a, __m256d __b)
218{
219 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
220}
221
222/// Compares two 256-bit vectors of [8 x float] and returns the greater
223/// of each pair of values.
224///
225/// \headerfile <x86intrin.h>
226///
227/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
228///
229/// \param __a
230/// A 256-bit vector of [8 x float] containing one of the operands.
231/// \param __b
232/// A 256-bit vector of [8 x float] containing one of the operands.
233/// \returns A 256-bit vector of [8 x float] containing the maximum values
234/// between both operands.
235static __inline __m256 __DEFAULT_FN_ATTRS
236_mm256_max_ps(__m256 __a, __m256 __b)
237{
238 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
239}
240
241/// Compares two 256-bit vectors of [4 x double] and returns the lesser
242/// of each pair of values.
243///
244/// \headerfile <x86intrin.h>
245///
246/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
247///
248/// \param __a
249/// A 256-bit vector of [4 x double] containing one of the operands.
250/// \param __b
251/// A 256-bit vector of [4 x double] containing one of the operands.
252/// \returns A 256-bit vector of [4 x double] containing the minimum values
253/// between both operands.
254static __inline __m256d __DEFAULT_FN_ATTRS
255_mm256_min_pd(__m256d __a, __m256d __b)
256{
257 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
258}
259
260/// Compares two 256-bit vectors of [8 x float] and returns the lesser
261/// of each pair of values.
262///
263/// \headerfile <x86intrin.h>
264///
265/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
266///
267/// \param __a
268/// A 256-bit vector of [8 x float] containing one of the operands.
269/// \param __b
270/// A 256-bit vector of [8 x float] containing one of the operands.
271/// \returns A 256-bit vector of [8 x float] containing the minimum values
272/// between both operands.
273static __inline __m256 __DEFAULT_FN_ATTRS
274_mm256_min_ps(__m256 __a, __m256 __b)
275{
276 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
277}
278
279/// Multiplies two 256-bit vectors of [4 x double].
280///
281/// \headerfile <x86intrin.h>
282///
283/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
284///
285/// \param __a
286/// A 256-bit vector of [4 x double] containing one of the operands.
287/// \param __b
288/// A 256-bit vector of [4 x double] containing one of the operands.
289/// \returns A 256-bit vector of [4 x double] containing the products of both
290/// operands.
291static __inline __m256d __DEFAULT_FN_ATTRS
292_mm256_mul_pd(__m256d __a, __m256d __b)
293{
294 return (__m256d)((__v4df)__a * (__v4df)__b);
295}
296
297/// Multiplies two 256-bit vectors of [8 x float].
298///
299/// \headerfile <x86intrin.h>
300///
301/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
302///
303/// \param __a
304/// A 256-bit vector of [8 x float] containing one of the operands.
305/// \param __b
306/// A 256-bit vector of [8 x float] containing one of the operands.
307/// \returns A 256-bit vector of [8 x float] containing the products of both
308/// operands.
309static __inline __m256 __DEFAULT_FN_ATTRS
310_mm256_mul_ps(__m256 __a, __m256 __b)
311{
312 return (__m256)((__v8sf)__a * (__v8sf)__b);
313}
314
315/// Calculates the square roots of the values in a 256-bit vector of
316/// [4 x double].
317///
318/// \headerfile <x86intrin.h>
319///
320/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
321///
322/// \param __a
323/// A 256-bit vector of [4 x double].
324/// \returns A 256-bit vector of [4 x double] containing the square roots of the
325/// values in the operand.
326static __inline __m256d __DEFAULT_FN_ATTRS
328{
329 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
330}
331
332/// Calculates the square roots of the values in a 256-bit vector of
333/// [8 x float].
334///
335/// \headerfile <x86intrin.h>
336///
337/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
338///
339/// \param __a
340/// A 256-bit vector of [8 x float].
341/// \returns A 256-bit vector of [8 x float] containing the square roots of the
342/// values in the operand.
343static __inline __m256 __DEFAULT_FN_ATTRS
345{
346 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
347}
348
349/// Calculates the reciprocal square roots of the values in a 256-bit
350/// vector of [8 x float].
351///
352/// \headerfile <x86intrin.h>
353///
354/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
355///
356/// \param __a
357/// A 256-bit vector of [8 x float].
358/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
359/// roots of the values in the operand.
360static __inline __m256 __DEFAULT_FN_ATTRS
362{
363 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
364}
365
366/// Calculates the reciprocals of the values in a 256-bit vector of
367/// [8 x float].
368///
369/// \headerfile <x86intrin.h>
370///
371/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
372///
373/// \param __a
374/// A 256-bit vector of [8 x float].
375/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
376/// values in the operand.
377static __inline __m256 __DEFAULT_FN_ATTRS
379{
380 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
381}
382
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)

/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

516/* Logical */
517/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
518///
519/// \headerfile <x86intrin.h>
520///
521/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
522///
523/// \param __a
524/// A 256-bit vector of [4 x double] containing one of the source operands.
525/// \param __b
526/// A 256-bit vector of [4 x double] containing one of the source operands.
527/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
528/// values between both operands.
529static __inline __m256d __DEFAULT_FN_ATTRS
530_mm256_and_pd(__m256d __a, __m256d __b)
531{
532 return (__m256d)((__v4du)__a & (__v4du)__b);
533}
534
535/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
536///
537/// \headerfile <x86intrin.h>
538///
539/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
540///
541/// \param __a
542/// A 256-bit vector of [8 x float] containing one of the source operands.
543/// \param __b
544/// A 256-bit vector of [8 x float] containing one of the source operands.
545/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
546/// values between both operands.
547static __inline __m256 __DEFAULT_FN_ATTRS
548_mm256_and_ps(__m256 __a, __m256 __b)
549{
550 return (__m256)((__v8su)__a & (__v8su)__b);
551}
552
553/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
554/// the one's complement of the values contained in the first source operand.
555///
556/// \headerfile <x86intrin.h>
557///
558/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
559///
560/// \param __a
561/// A 256-bit vector of [4 x double] containing the left source operand. The
562/// one's complement of this value is used in the bitwise AND.
563/// \param __b
564/// A 256-bit vector of [4 x double] containing the right source operand.
565/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
566/// values of the second operand and the one's complement of the first
567/// operand.
568static __inline __m256d __DEFAULT_FN_ATTRS
569_mm256_andnot_pd(__m256d __a, __m256d __b)
570{
571 return (__m256d)(~(__v4du)__a & (__v4du)__b);
572}
573
574/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
575/// the one's complement of the values contained in the first source operand.
576///
577/// \headerfile <x86intrin.h>
578///
579/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
580///
581/// \param __a
582/// A 256-bit vector of [8 x float] containing the left source operand. The
583/// one's complement of this value is used in the bitwise AND.
584/// \param __b
585/// A 256-bit vector of [8 x float] containing the right source operand.
586/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
587/// values of the second operand and the one's complement of the first
588/// operand.
589static __inline __m256 __DEFAULT_FN_ATTRS
590_mm256_andnot_ps(__m256 __a, __m256 __b)
591{
592 return (__m256)(~(__v8su)__a & (__v8su)__b);
593}
594
595/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
596///
597/// \headerfile <x86intrin.h>
598///
599/// This intrinsic corresponds to the <c> VORPD </c> instruction.
600///
601/// \param __a
602/// A 256-bit vector of [4 x double] containing one of the source operands.
603/// \param __b
604/// A 256-bit vector of [4 x double] containing one of the source operands.
605/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
606/// values between both operands.
607static __inline __m256d __DEFAULT_FN_ATTRS
608_mm256_or_pd(__m256d __a, __m256d __b)
609{
610 return (__m256d)((__v4du)__a | (__v4du)__b);
611}
612
613/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
614///
615/// \headerfile <x86intrin.h>
616///
617/// This intrinsic corresponds to the <c> VORPS </c> instruction.
618///
619/// \param __a
620/// A 256-bit vector of [8 x float] containing one of the source operands.
621/// \param __b
622/// A 256-bit vector of [8 x float] containing one of the source operands.
623/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
624/// values between both operands.
625static __inline __m256 __DEFAULT_FN_ATTRS
626_mm256_or_ps(__m256 __a, __m256 __b)
627{
628 return (__m256)((__v8su)__a | (__v8su)__b);
629}
630
631/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
632///
633/// \headerfile <x86intrin.h>
634///
635/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
636///
637/// \param __a
638/// A 256-bit vector of [4 x double] containing one of the source operands.
639/// \param __b
640/// A 256-bit vector of [4 x double] containing one of the source operands.
641/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
642/// values between both operands.
643static __inline __m256d __DEFAULT_FN_ATTRS
644_mm256_xor_pd(__m256d __a, __m256d __b)
645{
646 return (__m256d)((__v4du)__a ^ (__v4du)__b);
647}
648
649/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
650///
651/// \headerfile <x86intrin.h>
652///
653/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
654///
655/// \param __a
656/// A 256-bit vector of [8 x float] containing one of the source operands.
657/// \param __b
658/// A 256-bit vector of [8 x float] containing one of the source operands.
659/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
660/// values between both operands.
661static __inline __m256 __DEFAULT_FN_ATTRS
662_mm256_xor_ps(__m256 __a, __m256 __b)
663{
664 return (__m256)((__v8su)__a ^ (__v8su)__b);
665}
666
667/* Horizontal arithmetic */
668/// Horizontally adds the adjacent pairs of values contained in two
669/// 256-bit vectors of [4 x double].
670///
671/// \headerfile <x86intrin.h>
672///
673/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
674///
675/// \param __a
676/// A 256-bit vector of [4 x double] containing one of the source operands.
677/// The horizontal sums of the values are returned in the even-indexed
678/// elements of a vector of [4 x double].
679/// \param __b
680/// A 256-bit vector of [4 x double] containing one of the source operands.
681/// The horizontal sums of the values are returned in the odd-indexed
682/// elements of a vector of [4 x double].
683/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
684/// both operands.
685static __inline __m256d __DEFAULT_FN_ATTRS
686_mm256_hadd_pd(__m256d __a, __m256d __b)
687{
688 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
689}
690
691/// Horizontally adds the adjacent pairs of values contained in two
692/// 256-bit vectors of [8 x float].
693///
694/// \headerfile <x86intrin.h>
695///
696/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
697///
698/// \param __a
699/// A 256-bit vector of [8 x float] containing one of the source operands.
700/// The horizontal sums of the values are returned in the elements with
701/// index 0, 1, 4, 5 of a vector of [8 x float].
702/// \param __b
703/// A 256-bit vector of [8 x float] containing one of the source operands.
704/// The horizontal sums of the values are returned in the elements with
705/// index 2, 3, 6, 7 of a vector of [8 x float].
706/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
707/// both operands.
708static __inline __m256 __DEFAULT_FN_ATTRS
709_mm256_hadd_ps(__m256 __a, __m256 __b)
710{
711 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
712}
713
714/// Horizontally subtracts the adjacent pairs of values contained in two
715/// 256-bit vectors of [4 x double].
716///
717/// \headerfile <x86intrin.h>
718///
719/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
720///
721/// \param __a
722/// A 256-bit vector of [4 x double] containing one of the source operands.
723/// The horizontal differences between the values are returned in the
724/// even-indexed elements of a vector of [4 x double].
725/// \param __b
726/// A 256-bit vector of [4 x double] containing one of the source operands.
727/// The horizontal differences between the values are returned in the
728/// odd-indexed elements of a vector of [4 x double].
729/// \returns A 256-bit vector of [4 x double] containing the horizontal
730/// differences of both operands.
731static __inline __m256d __DEFAULT_FN_ATTRS
732_mm256_hsub_pd(__m256d __a, __m256d __b)
733{
734 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
735}
736
737/// Horizontally subtracts the adjacent pairs of values contained in two
738/// 256-bit vectors of [8 x float].
739///
740/// \headerfile <x86intrin.h>
741///
742/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
743///
744/// \param __a
745/// A 256-bit vector of [8 x float] containing one of the source operands.
746/// The horizontal differences between the values are returned in the
747/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
748/// \param __b
749/// A 256-bit vector of [8 x float] containing one of the source operands.
750/// The horizontal differences between the values are returned in the
751/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
752/// \returns A 256-bit vector of [8 x float] containing the horizontal
753/// differences of both operands.
754static __inline __m256 __DEFAULT_FN_ATTRS
755_mm256_hsub_ps(__m256 __a, __m256 __b)
756{
757 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
758}
759
760/* Vector permutations */
761/// Copies the values in a 128-bit vector of [2 x double] as specified
762/// by the 128-bit integer vector operand.
763///
764/// \headerfile <x86intrin.h>
765///
766/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
767///
768/// \param __a
769/// A 128-bit vector of [2 x double].
770/// \param __c
771/// A 128-bit integer vector operand specifying how the values are to be
772/// copied. \n
773/// Bit [1]: \n
774/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
775/// vector. \n
776/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
777/// returned vector. \n
778/// Bit [65]: \n
779/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
780/// returned vector. \n
781/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
782/// returned vector.
783/// \returns A 128-bit vector of [2 x double] containing the copied values.
784static __inline __m128d __DEFAULT_FN_ATTRS128
785_mm_permutevar_pd(__m128d __a, __m128i __c)
786{
787 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
788}
789
790/// Copies the values in a 256-bit vector of [4 x double] as specified
791/// by the 256-bit integer vector operand.
792///
793/// \headerfile <x86intrin.h>
794///
795/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
796///
797/// \param __a
798/// A 256-bit vector of [4 x double].
799/// \param __c
800/// A 256-bit integer vector operand specifying how the values are to be
801/// copied. \n
802/// Bit [1]: \n
803/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
804/// vector. \n
805/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
806/// returned vector. \n
807/// Bit [65]: \n
808/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
809/// returned vector. \n
810/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
811/// returned vector. \n
812/// Bit [129]: \n
813/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
814/// returned vector. \n
815/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
816/// returned vector. \n
817/// Bit [193]: \n
818/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
819/// returned vector. \n
820/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
821/// returned vector.
822/// \returns A 256-bit vector of [4 x double] containing the copied values.
823static __inline __m256d __DEFAULT_FN_ATTRS
824_mm256_permutevar_pd(__m256d __a, __m256i __c)
825{
826 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
827}
828
829/// Copies the values stored in a 128-bit vector of [4 x float] as
830/// specified by the 128-bit integer vector operand.
831/// \headerfile <x86intrin.h>
832///
833/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
834///
835/// \param __a
836/// A 128-bit vector of [4 x float].
837/// \param __c
838/// A 128-bit integer vector operand specifying how the values are to be
839/// copied. \n
840/// Bits [1:0]: \n
841/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
842/// returned vector. \n
843/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
844/// returned vector. \n
845/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
846/// returned vector. \n
847/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
848/// returned vector. \n
849/// Bits [33:32]: \n
850/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
851/// returned vector. \n
852/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
853/// returned vector. \n
854/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
855/// returned vector. \n
856/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
857/// returned vector. \n
858/// Bits [65:64]: \n
859/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
860/// returned vector. \n
861/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
862/// returned vector. \n
863/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
864/// returned vector. \n
865/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
866/// returned vector. \n
867/// Bits [97:96]: \n
868/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
869/// returned vector. \n
870/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
871/// returned vector. \n
872/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
873/// returned vector. \n
874/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
875/// returned vector.
876/// \returns A 128-bit vector of [4 x float] containing the copied values.
877static __inline __m128 __DEFAULT_FN_ATTRS128
878_mm_permutevar_ps(__m128 __a, __m128i __c)
879{
880 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
881}
882
883/// Copies the values stored in a 256-bit vector of [8 x float] as
884/// specified by the 256-bit integer vector operand.
885///
886/// \headerfile <x86intrin.h>
887///
888/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
889///
890/// \param __a
891/// A 256-bit vector of [8 x float].
892/// \param __c
893/// A 256-bit integer vector operand specifying how the values are to be
894/// copied. \n
895/// Bits [1:0]: \n
896/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
897/// returned vector. \n
898/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
899/// returned vector. \n
900/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
901/// returned vector. \n
902/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
903/// returned vector. \n
904/// Bits [33:32]: \n
905/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
906/// returned vector. \n
907/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
908/// returned vector. \n
909/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
910/// returned vector. \n
911/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
912/// returned vector. \n
913/// Bits [65:64]: \n
914/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
915/// returned vector. \n
916/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
917/// returned vector. \n
918/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
919/// returned vector. \n
920/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
921/// returned vector. \n
922/// Bits [97:96]: \n
923/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
924/// returned vector. \n
925/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
926/// returned vector. \n
927/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
928/// returned vector. \n
929/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
930/// returned vector. \n
931/// Bits [129:128]: \n
932/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
933/// returned vector. \n
934/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
935/// returned vector. \n
936/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
937/// returned vector. \n
938/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
939/// returned vector. \n
940/// Bits [161:160]: \n
941/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
942/// returned vector. \n
943/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
944/// returned vector. \n
945/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
946/// returned vector. \n
947/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
948/// returned vector. \n
949/// Bits [193:192]: \n
950/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
951/// returned vector. \n
952/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
953/// returned vector. \n
954/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
955/// returned vector. \n
956/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
957/// returned vector. \n
958/// Bits [225:224]: \n
959/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
960/// returned vector. \n
961/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
962/// returned vector. \n
963/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
964/// returned vector. \n
965/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
966/// returned vector.
967/// \returns A 256-bit vector of [8 x float] containing the copied values.
968static __inline __m256 __DEFAULT_FN_ATTRS
970{
971 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
972}
973
974/// Copies the values in a 128-bit vector of [2 x double] as specified
975/// by the immediate integer operand.
976///
977/// \headerfile <x86intrin.h>
978///
979/// \code
980/// __m128d _mm_permute_pd(__m128d A, const int C);
981/// \endcode
982///
983/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
984///
985/// \param A
986/// A 128-bit vector of [2 x double].
987/// \param C
988/// An immediate integer operand specifying how the values are to be
989/// copied. \n
990/// Bit [0]: \n
991/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
992/// vector. \n
993/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
994/// returned vector. \n
995/// Bit [1]: \n
996/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
997/// returned vector. \n
998/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
999/// returned vector.
1000/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Thin wrapper over the 128-bit VPERMILPD builtin: A is viewed as __v2df,
   C is forwarded as an int, and the result is cast back to __m128d. */
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd( \
      (__v2df)(__m128d)(A), \
      (int)(C)))
1003
1004/// Copies the values in a 256-bit vector of [4 x double] as specified by
1005/// the immediate integer operand.
1006///
1007/// \headerfile <x86intrin.h>
1008///
1009/// \code
1010/// __m256d _mm256_permute_pd(__m256d A, const int C);
1011/// \endcode
1012///
1013/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1014///
1015/// \param A
1016/// A 256-bit vector of [4 x double].
1017/// \param C
1018/// An immediate integer operand specifying how the values are to be
1019/// copied. \n
1020/// Bit [0]: \n
1021/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1022/// vector. \n
1023/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1024/// returned vector. \n
1025/// Bit [1]: \n
1026/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1027/// returned vector. \n
1028/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1029/// returned vector. \n
1030/// Bit [2]: \n
1031/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1032/// returned vector. \n
1033/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1034/// returned vector. \n
1035/// Bit [3]: \n
1036/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1037/// returned vector. \n
1038/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1039/// returned vector.
1040/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Thin wrapper over the 256-bit VPERMILPD builtin: A is viewed as __v4df,
   C is forwarded as an int, and the result is cast back to __m256d. */
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256( \
      (__v4df)(__m256d)(A), \
      (int)(C)))
1043
1044/// Copies the values in a 128-bit vector of [4 x float] as specified by
1045/// the immediate integer operand.
1046///
1047/// \headerfile <x86intrin.h>
1048///
1049/// \code
1050/// __m128 _mm_permute_ps(__m128 A, const int C);
1051/// \endcode
1052///
1053/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1054///
1055/// \param A
1056/// A 128-bit vector of [4 x float].
1057/// \param C
1058/// An immediate integer operand specifying how the values are to be
1059/// copied. \n
1060/// Bits [1:0]: \n
1061/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1062/// returned vector. \n
1063/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1064/// returned vector. \n
1065/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1066/// returned vector. \n
1067/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1068/// returned vector. \n
1069/// Bits [3:2]: \n
1070/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1071/// returned vector. \n
1072/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1073/// returned vector. \n
1074/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1075/// returned vector. \n
1076/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1077/// returned vector. \n
1078/// Bits [5:4]: \n
1079/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1080/// returned vector. \n
1081/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1082/// returned vector. \n
1083/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1084/// returned vector. \n
1085/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1086/// returned vector. \n
1087/// Bits [7:6]: \n
1088/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1089/// returned vector. \n
1090/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1091/// returned vector. \n
1092/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1093/// returned vector. \n
1094/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1095/// returned vector.
1096/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Thin wrapper over the 128-bit VPERMILPS builtin: A is viewed as __v4sf,
   C is forwarded as an int, and the result is cast back to __m128. */
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps( \
      (__v4sf)(__m128)(A), \
      (int)(C)))
1099
1100/// Copies the values in a 256-bit vector of [8 x float] as specified by
1101/// the immediate integer operand.
1102///
1103/// \headerfile <x86intrin.h>
1104///
1105/// \code
1106/// __m256 _mm256_permute_ps(__m256 A, const int C);
1107/// \endcode
1108///
1109/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1110///
1111/// \param A
1112/// A 256-bit vector of [8 x float].
1113/// \param C
1114/// An immediate integer operand specifying how the values are to be
1115/// copied. \n
1116/// Bits [1:0]: \n
1117/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1118/// returned vector. \n
1119/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1120/// returned vector. \n
1121/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1122/// returned vector. \n
1123/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1124/// returned vector. \n
1125/// Bits [3:2]: \n
1126/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1127/// returned vector. \n
1128/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1129/// returned vector. \n
1130/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1131/// returned vector. \n
1132/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1133/// returned vector. \n
1134/// Bits [5:4]: \n
1135/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1136/// returned vector. \n
1137/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1138/// returned vector. \n
1139/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1140/// returned vector. \n
1141/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1142/// returned vector. \n
1143/// Bits [7:6]: \n
1144/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1145/// returned vector. \n
1146/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1147/// returned vector. \n
1148/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1149/// returned vector. \n
1150/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1151/// returned vector. \n
1152/// Bits [1:0]: \n
1153/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1154/// returned vector. \n
1155/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1156/// returned vector. \n
1157/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1158/// returned vector. \n
1159/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1160/// returned vector. \n
1161/// Bits [3:2]: \n
1162/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1163/// returned vector. \n
1164/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1165/// returned vector. \n
1166/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1167/// returned vector. \n
1168/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1169/// returned vector. \n
1170/// Bits [5:4]: \n
1171/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1172/// returned vector. \n
1173/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1174/// returned vector. \n
1175/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1176/// returned vector. \n
1177/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1178/// returned vector. \n
1179/// Bits [7:6]: \n
1180/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1181/// returned vector. \n
1182/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1183/// returned vector. \n
1184/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1185/// returned vector. \n
1186/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1187/// returned vector.
1188/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Thin wrapper over the 256-bit VPERMILPS builtin: A is viewed as __v8sf,
   C is forwarded as an int, and the result is cast back to __m256. */
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256( \
      (__v8sf)(__m256)(A), \
      (int)(C)))
1191
1192/// Permutes 128-bit data values stored in two 256-bit vectors of
1193/// [4 x double], as specified by the immediate integer operand.
1194///
1195/// \headerfile <x86intrin.h>
1196///
1197/// \code
1198/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1199/// \endcode
1200///
1201/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1202///
1203/// \param V1
1204/// A 256-bit vector of [4 x double].
1205/// \param V2
1206/// A 256-bit vector of [4 x double].
1207/// \param M
1208/// An immediate integer operand specifying how the values are to be
1209/// permuted. \n
1210/// Bits [1:0]: \n
1211/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1212/// destination. \n
1213/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1214/// destination. \n
1215/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1216/// destination. \n
1217/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1218/// destination. \n
1219/// Bits [5:4]: \n
1220/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1221/// destination. \n
1222/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1223/// destination. \n
1224/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1225/// destination. \n
1226/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1227/// destination.
1228/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Forwards both sources as __v4df and the control M as an int to the
   VPERM2F128 builtin for [4 x double]; the result is cast to __m256d. */
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (int)(M)))
1232
1233/// Permutes 128-bit data values stored in two 256-bit vectors of
1234/// [8 x float], as specified by the immediate integer operand.
1235///
1236/// \headerfile <x86intrin.h>
1237///
1238/// \code
1239/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1240/// \endcode
1241///
1242/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1243///
1244/// \param V1
1245/// A 256-bit vector of [8 x float].
1246/// \param V2
1247/// A 256-bit vector of [8 x float].
1248/// \param M
1249/// An immediate integer operand specifying how the values are to be
1250/// permuted. \n
1251/// Bits [1:0]: \n
1252/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1253/// destination. \n
1254/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1255/// destination. \n
1256/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1257/// destination. \n
1258/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1259/// destination. \n
1260/// Bits [5:4]: \n
1261/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1262/// destination. \n
1263/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1264/// destination. \n
1265/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1266/// destination. \n
1267/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1268/// destination.
1269/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Forwards both sources as __v8sf and the control M as an int to the
   VPERM2F128 builtin for [8 x float]; the result is cast to __m256. */
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (int)(M)))
1273
1274/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1275/// as specified by the immediate integer operand.
1276///
1277/// \headerfile <x86intrin.h>
1278///
1279/// \code
1280/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1281/// \endcode
1282///
1283/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1284///
1285/// \param V1
1286/// A 256-bit integer vector.
1287/// \param V2
1288/// A 256-bit integer vector.
1289/// \param M
1290/// An immediate integer operand specifying how the values are to be copied. \n
1291/// Bits [1:0]: \n
1292/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1293/// destination. \n
1294/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1295/// destination. \n
1296/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1297/// destination. \n
1298/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1299/// destination. \n
1300/// Bits [5:4]: \n
1301/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1302/// destination. \n
1303/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1304/// destination. \n
1305/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1306/// destination. \n
1307/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1308/// destination.
1309/// \returns A 256-bit integer vector containing the copied values.
/* Integer flavour of the VPERM2F128 wrapper: both sources are viewed as
   __v8si, M is forwarded as an int, and the result is cast to __m256i. */
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256( \
      (__v8si)(__m256i)(V1), \
      (__v8si)(__m256i)(V2), \
      (int)(M)))
1313
1314/* Vector Blend */
1315/// Merges 64-bit double-precision data values stored in either of the
1316/// two 256-bit vectors of [4 x double], as specified by the immediate
1317/// integer operand.
1318///
1319/// \headerfile <x86intrin.h>
1320///
1321/// \code
1322/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1323/// \endcode
1324///
1325/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1326///
1327/// \param V1
1328/// A 256-bit vector of [4 x double].
1329/// \param V2
1330/// A 256-bit vector of [4 x double].
1331/// \param M
1332/// An immediate integer operand, with mask bits [3:0] specifying how the
1333/// values are to be copied. The position of the mask bit corresponds to the
1334/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1335/// element in operand \a V1 is copied to the same position in the
1336/// destination. When a mask bit is 1, the corresponding 64-bit element in
1337/// operand \a V2 is copied to the same position in the destination.
1338/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Forwards V1 and V2 as __v4df and the mask M as an int to the VBLENDPD
   builtin; the result is cast back to __m256d. */
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (int)(M)))
1342
1343/// Merges 32-bit single-precision data values stored in either of the
1344/// two 256-bit vectors of [8 x float], as specified by the immediate
1345/// integer operand.
1346///
1347/// \headerfile <x86intrin.h>
1348///
1349/// \code
1350/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1351/// \endcode
1352///
1353/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1354///
1355/// \param V1
1356/// A 256-bit vector of [8 x float].
1357/// \param V2
1358/// A 256-bit vector of [8 x float].
1359/// \param M
1360/// An immediate integer operand, with mask bits [7:0] specifying how the
1361/// values are to be copied. The position of the mask bit corresponds to the
1362/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1363/// element in operand \a V1 is copied to the same position in the
1364/// destination. When a mask bit is 1, the corresponding 32-bit element in
1365/// operand \a V2 is copied to the same position in the destination.
1366/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Forwards V1 and V2 as __v8sf and the mask M as an int to the VBLENDPS
   builtin; the result is cast back to __m256. */
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (int)(M)))
1370
1371/// Merges 64-bit double-precision data values stored in either of the
1372/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1373/// operand.
1374///
1375/// \headerfile <x86intrin.h>
1376///
1377/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1378///
1379/// \param __a
1380/// A 256-bit vector of [4 x double].
1381/// \param __b
1382/// A 256-bit vector of [4 x double].
1383/// \param __c
1384/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1385/// how the values are to be copied. The position of the mask bit corresponds
1386/// to the most significant bit of a copied value. When a mask bit is 0, the
1387/// corresponding 64-bit element in operand \a __a is copied to the same
1388/// position in the destination. When a mask bit is 1, the corresponding
1389/// 64-bit element in operand \a __b is copied to the same position in the
1390/// destination.
1391/// \returns A 256-bit vector of [4 x double] containing the copied values.
1392static __inline __m256d __DEFAULT_FN_ATTRS
1393_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1394{
1395 return (__m256d)__builtin_ia32_blendvpd256(
1396 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1397}
1398
1399/// Merges 32-bit single-precision data values stored in either of the
1400/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1401/// operand.
1402///
1403/// \headerfile <x86intrin.h>
1404///
1405/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1406///
1407/// \param __a
1408/// A 256-bit vector of [8 x float].
1409/// \param __b
1410/// A 256-bit vector of [8 x float].
1411/// \param __c
1412/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1413/// and 31 specifying how the values are to be copied. The position of the
1414/// mask bit corresponds to the most significant bit of a copied value. When
1415/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1416/// copied to the same position in the destination. When a mask bit is 1, the
1417/// corresponding 32-bit element in operand \a __b is copied to the same
1418/// position in the destination.
1419/// \returns A 256-bit vector of [8 x float] containing the copied values.
1420static __inline __m256 __DEFAULT_FN_ATTRS
1421_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1422{
1423 return (__m256)__builtin_ia32_blendvps256(
1424 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1425}
1426
1427/* Vector Dot Product */
1428/// Computes two dot products in parallel, using the lower and upper
1429/// halves of two [8 x float] vectors as input to the two computations, and
1430/// returning the two dot products in the lower and upper halves of the
1431/// [8 x float] result.
1432///
1433/// The immediate integer operand controls which input elements will
1434/// contribute to the dot product, and where the final results are returned.
1435/// In general, for each dot product, the four corresponding elements of the
1436/// input vectors are multiplied; the first two and second two products are
1437/// summed, then the two sums are added to form the final result.
1438///
1439/// \headerfile <x86intrin.h>
1440///
1441/// \code
1442/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1443/// \endcode
1444///
1445/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1446///
1447/// \param V1
1448/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1449/// \param V2
1450/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1451/// \param M
1452/// An immediate integer argument. Bits [7:4] determine which elements of
1453/// the input vectors are used, with bit [4] corresponding to the lowest
1454/// element and bit [7] corresponding to the highest element of each [4 x
1455/// float] subvector. If a bit is set, the corresponding elements from the
1456/// two input vectors are used as an input for dot product; otherwise that
1457/// input is treated as zero. Bits [3:0] determine which elements of the
1458/// result will receive a copy of the final dot product, with bit [0]
1459/// corresponding to the lowest element and bit [3] corresponding to the
1460/// highest element of each [4 x float] subvector. If a bit is set, the dot
1461/// product is returned in the corresponding element; otherwise that element
1462/// is set to zero. The bitmask is applied in the same way to each of the
1463/// two parallel dot product computations.
1464/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* Forwards V1 and V2 as __v8sf to the VDPPS builtin. Unlike the
   neighbouring wrappers, M is passed through without an (int) cast. */
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (M)))
1468
1469/* Vector shuffle */
1470/// Selects 8 float values from the 256-bit operands of [8 x float], as
1471/// specified by the immediate value operand.
1472///
1473/// The four selected elements in each operand are copied to the destination
1474/// according to the bits specified in the immediate operand. The selected
1475/// elements from the first 256-bit operand are copied to bits [63:0] and
1476/// bits [191:128] of the destination, and the selected elements from the
1477/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1478/// the destination. For example, if bits [7:0] of the immediate operand
1479/// contain a value of 0xFF, the 256-bit destination vector would contain the
1480/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1481///
1482/// \headerfile <x86intrin.h>
1483///
1484/// \code
1485/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1486/// \endcode
1487///
1488/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1489///
1490/// \param a
1491/// A 256-bit vector of [8 x float]. The four selected elements in this
1492/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1493/// according to the bits specified in the immediate operand.
1494/// \param b
1495/// A 256-bit vector of [8 x float]. The four selected elements in this
1496/// operand are copied to bits [127:64] and bits [255:192] in the
1497/// destination, according to the bits specified in the immediate operand.
1498/// \param mask
1499/// An immediate value containing an 8-bit value specifying which elements to
1500/// copy from \a a and \a b. \n
1501/// Bits [3:0] specify the values copied from operand \a a. \n
1502/// Bits [7:4] specify the values copied from operand \a b. \n
1503/// The destinations within the 256-bit destination are assigned values as
1504/// follows, according to the bit value assignments described below: \n
1505/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1506/// destination. \n
1507/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1508/// destination. \n
1509/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1510/// destination. \n
1511/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1512/// the destination. \n
1513/// Bit value assignments: \n
1514/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1515/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1516/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1517/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1518/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1519/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1520/// <c>[b6, b4, b2, b0]</c>.
1521/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
/* Forwards a and b as __v8sf and the selector mask as an int to the
   VSHUFPS builtin; the result is cast back to __m256. */
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256( \
      (__v8sf)(__m256)(a), \
      (__v8sf)(__m256)(b), \
      (int)(mask)))
1525
1526/// Selects four double-precision values from the 256-bit operands of
1527/// [4 x double], as specified by the immediate value operand.
1528///
1529/// The selected elements from the first 256-bit operand are copied to bits
1530/// [63:0] and bits [191:128] in the destination, and the selected elements
1531/// from the second 256-bit operand are copied to bits [127:64] and bits
1532/// [255:192] in the destination. For example, if bits [3:0] of the immediate
1533/// operand contain a value of 0xF, the 256-bit destination vector would
1534/// contain the following values: b[3], a[3], b[1], a[1].
1535///
1536/// \headerfile <x86intrin.h>
1537///
1538/// \code
1539/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
1540/// \endcode
1541///
1542/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
1543///
1544/// \param a
1545/// A 256-bit vector of [4 x double].
1546/// \param b
1547/// A 256-bit vector of [4 x double].
1548/// \param mask
1549/// An immediate value whose bits [3:0] specify which elements to
1550/// copy from \a a and \a b: \n
1551/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
1552/// destination. \n
1553/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
1554/// destination. \n
1555/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
1556/// destination. \n
1557/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
1558/// destination. \n
1559/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
1560/// destination. \n
1561/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
1562/// destination. \n
1563/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
1564/// destination. \n
1565/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
1566/// destination.
1567/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
/* Shuffle of two [4 x double] vectors: both operands are handed to the builtin
   as __v4df, the immediate selector as int, and the result is returned as
   __m256d. */
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1571
/* Comparison predicates 0x08-0x1f for the immediate operand of the compare
   intrinsics. Predicates 0x00-0x07 are not defined in this part of the file
   (NOTE(review): presumably supplied by an earlier header - confirm). */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1596
1597/// Compares each of the corresponding double-precision values of two
1598/// 256-bit vectors of [4 x double], using the operation specified by the
1599/// immediate integer operand.
1600///
1601/// Returns a [4 x double] vector consisting of four doubles corresponding to
1602/// the four comparison results: zero if the comparison is false, and all 1's
1603/// if the comparison is true.
1604///
1605/// \headerfile <x86intrin.h>
1606///
1607/// \code
1608/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
1609/// \endcode
1610///
1611/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1612///
1613/// \param a
1614/// A 256-bit vector of [4 x double].
1615/// \param b
1616/// A 256-bit vector of [4 x double].
1617/// \param c
1618/// An immediate integer operand, with bits [4:0] specifying which comparison
1619/// operation to use: \n
1620/// 0x00: Equal (ordered, non-signaling) \n
1621/// 0x01: Less-than (ordered, signaling) \n
1622/// 0x02: Less-than-or-equal (ordered, signaling) \n
1623/// 0x03: Unordered (non-signaling) \n
1624/// 0x04: Not-equal (unordered, non-signaling) \n
1625/// 0x05: Not-less-than (unordered, signaling) \n
1626/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1627/// 0x07: Ordered (non-signaling) \n
1628/// 0x08: Equal (unordered, non-signaling) \n
1629/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1630/// 0x0A: Not-greater-than (unordered, signaling) \n
1631/// 0x0B: False (ordered, non-signaling) \n
1632/// 0x0C: Not-equal (ordered, non-signaling) \n
1633/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1634/// 0x0E: Greater-than (ordered, signaling) \n
1635/// 0x0F: True (unordered, non-signaling) \n
1636/// 0x10: Equal (ordered, signaling) \n
1637/// 0x11: Less-than (ordered, non-signaling) \n
1638/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1639/// 0x13: Unordered (signaling) \n
1640/// 0x14: Not-equal (unordered, signaling) \n
1641/// 0x15: Not-less-than (unordered, non-signaling) \n
1642/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1643/// 0x17: Ordered (signaling) \n
1644/// 0x18: Equal (unordered, signaling) \n
1645/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1646/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1647/// 0x1B: False (ordered, signaling) \n
1648/// 0x1C: Not-equal (ordered, signaling) \n
1649/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1650/// 0x1E: Greater-than (ordered, non-signaling) \n
1651/// 0x1F: True (unordered, signaling)
1652/// \returns A 256-bit vector of [4 x double] containing the comparison results.
/* Lane-wise compare of two [4 x double] vectors. The predicate c is forwarded
   to the builtin as written (no cast), unlike the vector operands. */
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1656
1657/// Compares each of the corresponding values of two 256-bit vectors of
1658/// [8 x float], using the operation specified by the immediate integer
1659/// operand.
1660///
1661/// Returns a [8 x float] vector consisting of eight floats corresponding to
1662/// the eight comparison results: zero if the comparison is false, and all
1663/// 1's if the comparison is true.
1664///
1665/// \headerfile <x86intrin.h>
1666///
1667/// \code
1668/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
1669/// \endcode
1670///
1671/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1672///
1673/// \param a
1674/// A 256-bit vector of [8 x float].
1675/// \param b
1676/// A 256-bit vector of [8 x float].
1677/// \param c
1678/// An immediate integer operand, with bits [4:0] specifying which comparison
1679/// operation to use: \n
1680/// 0x00: Equal (ordered, non-signaling) \n
1681/// 0x01: Less-than (ordered, signaling) \n
1682/// 0x02: Less-than-or-equal (ordered, signaling) \n
1683/// 0x03: Unordered (non-signaling) \n
1684/// 0x04: Not-equal (unordered, non-signaling) \n
1685/// 0x05: Not-less-than (unordered, signaling) \n
1686/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1687/// 0x07: Ordered (non-signaling) \n
1688/// 0x08: Equal (unordered, non-signaling) \n
1689/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1690/// 0x0A: Not-greater-than (unordered, signaling) \n
1691/// 0x0B: False (ordered, non-signaling) \n
1692/// 0x0C: Not-equal (ordered, non-signaling) \n
1693/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1694/// 0x0E: Greater-than (ordered, signaling) \n
1695/// 0x0F: True (unordered, non-signaling) \n
1696/// 0x10: Equal (ordered, signaling) \n
1697/// 0x11: Less-than (ordered, non-signaling) \n
1698/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1699/// 0x13: Unordered (signaling) \n
1700/// 0x14: Not-equal (unordered, signaling) \n
1701/// 0x15: Not-less-than (unordered, non-signaling) \n
1702/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1703/// 0x17: Ordered (signaling) \n
1704/// 0x18: Equal (unordered, signaling) \n
1705/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1706/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1707/// 0x1B: False (ordered, signaling) \n
1708/// 0x1C: Not-equal (ordered, signaling) \n
1709/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1710/// 0x1E: Greater-than (ordered, non-signaling) \n
1711/// 0x1F: True (unordered, signaling)
1712/// \returns A 256-bit vector of [8 x float] containing the comparison results.
/* Lane-wise compare of two [8 x float] vectors. The predicate c is forwarded
   to the builtin as written (no cast), unlike the vector operands. */
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1716
1717/// Takes a [8 x i32] vector and returns the vector element value
1718/// indexed by the immediate constant operand.
1719///
1720/// \headerfile <x86intrin.h>
1721///
1722/// \code
1723/// int _mm256_extract_epi32(__m256i X, const int N);
1724/// \endcode
1725///
1726/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1727/// instruction.
1728///
1729/// \param X
1730/// A 256-bit vector of [8 x i32].
1731/// \param N
1732/// An immediate integer operand with bits [2:0] determining which vector
1733/// element is extracted and returned.
1734/// \returns A 32-bit integer containing the extracted 32 bits of extended
1735/// packed data.
/* Reads element N of X viewed as eight 32-bit integers; the result is int. */
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1738
1739/// Takes a [16 x i16] vector and returns the vector element value
1740/// indexed by the immediate constant operand.
1741///
1742/// \headerfile <x86intrin.h>
1743///
1744/// \code
1745/// int _mm256_extract_epi16(__m256i X, const int N);
1746/// \endcode
1747///
1748/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1749/// instruction.
1750///
1751/// \param X
1752/// A 256-bit integer vector of [16 x i16].
1753/// \param N
1754/// An immediate integer operand with bits [3:0] determining which vector
1755/// element is extracted and returned.
1756/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
1757/// packed data.
/* Reads element N of X viewed as sixteen 16-bit integers. Going through
   (unsigned short) before (int) makes the widening a zero extension. */
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
1761
1762/// Takes a [32 x i8] vector and returns the vector element value
1763/// indexed by the immediate constant operand.
1764///
1765/// \headerfile <x86intrin.h>
1766///
1767/// \code
1768/// int _mm256_extract_epi8(__m256i X, const int N);
1769/// \endcode
1770///
1771/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1772/// instruction.
1773///
1774/// \param X
1775/// A 256-bit integer vector of [32 x i8].
1776/// \param N
1777/// An immediate integer operand with bits [4:0] determining which vector
1778/// element is extracted and returned.
1779/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
1780/// packed data.
/* Reads element N of X viewed as thirty-two 8-bit integers. Going through
   (unsigned char) before (int) makes the widening a zero extension. */
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
1784
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// This macro is only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
1808
1809/// Takes a [8 x i32] vector and replaces the vector element value
1810/// indexed by the immediate constant operand by a new value. Returns the
1811/// modified vector.
1812///
1813/// \headerfile <x86intrin.h>
1814///
1815/// \code
1816/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
1817/// \endcode
1818///
1819/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
1820/// instruction.
1821///
1822/// \param X
1823/// A vector of [8 x i32] to be used by the insert operation.
1824/// \param I
1825/// An integer value. The replacement value for the insert operation.
1826/// \param N
1827/// An immediate integer specifying the index of the vector element to be
1828/// replaced.
1829/// \returns A copy of vector \a X, after replacing its element indexed by
1830/// \a N with \a I.
/* Yields a copy of X (viewed as eight 32-bit integers) with element N replaced
   by I; both I and N are cast to int before reaching the builtin. */
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
1834
1835
1836/// Takes a [16 x i16] vector and replaces the vector element value
1837/// indexed by the immediate constant operand with a new value. Returns the
1838/// modified vector.
1839///
1840/// \headerfile <x86intrin.h>
1841///
1842/// \code
1843/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
1844/// \endcode
1845///
1846/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
1847/// instruction.
1848///
1849/// \param X
1850/// A vector of [16 x i16] to be used by the insert operation.
1851/// \param I
1852/// An i16 integer value. The replacement value for the insert operation.
1853/// \param N
1854/// An immediate integer specifying the index of the vector element to be
1855/// replaced.
1856/// \returns A copy of vector \a X, after replacing its element indexed by
1857/// \a N with \a I.
/* Yields a copy of X (viewed as sixteen 16-bit integers) with element N
   replaced by I; both I and N are cast to int before reaching the builtin. */
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
1861
1862/// Takes a [32 x i8] vector and replaces the vector element value
1863/// indexed by the immediate constant operand with a new value. Returns the
1864/// modified vector.
1865///
1866/// \headerfile <x86intrin.h>
1867///
1868/// \code
1869/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
1870/// \endcode
1871///
1872/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
1873/// instruction.
1874///
1875/// \param X
1876/// A vector of [32 x i8] to be used by the insert operation.
1877/// \param I
1878/// An i8 integer value. The replacement value for the insert operation.
1879/// \param N
1880/// An immediate integer specifying the index of the vector element to be
1881/// replaced.
1882/// \returns A copy of vector \a X, after replacing its element indexed by
1883/// \a N with \a I.
/* Yields a copy of X (viewed as thirty-two 8-bit integers) with element N
   replaced by I; both I and N are cast to int before reaching the builtin. */
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
1887
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// This macro is only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
1915
1916/* Conversion */
1917/// Converts a vector of [4 x i32] into a vector of [4 x double].
1918///
1919/// \headerfile <x86intrin.h>
1920///
1921/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
1922///
1923/// \param __a
1924/// A 128-bit integer vector of [4 x i32].
1925/// \returns A 256-bit vector of [4 x double] containing the converted values.
1926static __inline __m256d __DEFAULT_FN_ATTRS
1928{
1929 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
1930}
1931
1932/// Converts a vector of [8 x i32] into a vector of [8 x float].
1933///
1934/// \headerfile <x86intrin.h>
1935///
1936/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
1937///
1938/// \param __a
1939/// A 256-bit integer vector.
1940/// \returns A 256-bit vector of [8 x float] containing the converted values.
1941static __inline __m256 __DEFAULT_FN_ATTRS
1943{
1944 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
1945}
1946
1947/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
1948/// [4 x float].
1949///
1950/// \headerfile <x86intrin.h>
1951///
1952/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
1953///
1954/// \param __a
1955/// A 256-bit vector of [4 x double].
1956/// \returns A 128-bit vector of [4 x float] containing the converted values.
1957static __inline __m128 __DEFAULT_FN_ATTRS
1959{
1960 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
1961}
1962
1963/// Converts a vector of [8 x float] into a vector of [8 x i32].
1964///
1965/// \headerfile <x86intrin.h>
1966///
1967/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
1968///
1969/// \param __a
1970/// A 256-bit vector of [8 x float].
1971/// \returns A 256-bit integer vector containing the converted values.
1972static __inline __m256i __DEFAULT_FN_ATTRS
1974{
1975 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
1976}
1977
1978/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
1979/// x double].
1980///
1981/// \headerfile <x86intrin.h>
1982///
1983/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
1984///
1985/// \param __a
1986/// A 128-bit vector of [4 x float].
1987/// \returns A 256-bit vector of [4 x double] containing the converted values.
1988static __inline __m256d __DEFAULT_FN_ATTRS
1990{
1991 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
1992}
1993
1994/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
1995/// x i32], truncating the result by rounding towards zero when it is
1996/// inexact.
1997///
1998/// \headerfile <x86intrin.h>
1999///
2000/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2001///
2002/// \param __a
2003/// A 256-bit vector of [4 x double].
2004/// \returns A 128-bit integer vector containing the converted values.
2005static __inline __m128i __DEFAULT_FN_ATTRS
2007{
2008 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2009}
2010
2011/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2012/// x i32]. When a conversion is inexact, the value returned is rounded
2013/// according to the rounding control bits in the MXCSR register.
2014///
2015/// \headerfile <x86intrin.h>
2016///
2017/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2018///
2019/// \param __a
2020/// A 256-bit vector of [4 x double].
2021/// \returns A 128-bit integer vector containing the converted values.
2022static __inline __m128i __DEFAULT_FN_ATTRS
2024{
2025 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2026}
2027
2028/// Converts a vector of [8 x float] into a vector of [8 x i32],
2029/// truncating the result by rounding towards zero when it is inexact.
2030///
2031/// \headerfile <x86intrin.h>
2032///
2033/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2034///
2035/// \param __a
2036/// A 256-bit vector of [8 x float].
2037/// \returns A 256-bit integer vector containing the converted values.
2038static __inline __m256i __DEFAULT_FN_ATTRS
2040{
2041 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2042}
2043
2044/// Returns the first element of the input vector of [4 x double].
2045///
2046/// \headerfile <x86intrin.h>
2047///
2048/// This intrinsic is a utility function and does not correspond to a specific
2049/// instruction.
2050///
2051/// \param __a
2052/// A 256-bit vector of [4 x double].
2053/// \returns A 64 bit double containing the first element of the input vector.
2054static __inline double __DEFAULT_FN_ATTRS
2056{
2057 return __a[0];
2058}
2059
2060/// Returns the first element of the input vector of [8 x i32].
2061///
2062/// \headerfile <x86intrin.h>
2063///
2064/// This intrinsic is a utility function and does not correspond to a specific
2065/// instruction.
2066///
2067/// \param __a
2068/// A 256-bit vector of [8 x i32].
2069/// \returns A 32 bit integer containing the first element of the input vector.
2070static __inline int __DEFAULT_FN_ATTRS
2072{
2073 __v8si __b = (__v8si)__a;
2074 return __b[0];
2075}
2076
2077/// Returns the first element of the input vector of [8 x float].
2078///
2079/// \headerfile <x86intrin.h>
2080///
2081/// This intrinsic is a utility function and does not correspond to a specific
2082/// instruction.
2083///
2084/// \param __a
2085/// A 256-bit vector of [8 x float].
2086/// \returns A 32 bit float containing the first element of the input vector.
2087static __inline float __DEFAULT_FN_ATTRS
2089{
2090 return __a[0];
2091}
2092
2093/* Vector replicate */
2094/// Moves and duplicates odd-indexed values from a 256-bit vector of
2095/// [8 x float] to float values in a 256-bit vector of [8 x float].
2096///
2097/// \headerfile <x86intrin.h>
2098///
2099/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2100///
2101/// \param __a
2102/// A 256-bit vector of [8 x float]. \n
2103/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2104/// the return value. \n
2105/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2106/// the return value. \n
2107/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2108/// return value. \n
2109/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2110/// return value.
2111/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2112/// values.
2113static __inline __m256 __DEFAULT_FN_ATTRS
2115{
2116 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2117}
2118
2119/// Moves and duplicates even-indexed values from a 256-bit vector of
2120/// [8 x float] to float values in a 256-bit vector of [8 x float].
2121///
2122/// \headerfile <x86intrin.h>
2123///
2124/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2125///
2126/// \param __a
2127/// A 256-bit vector of [8 x float]. \n
2128/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2129/// the return value. \n
2130/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2131/// the return value. \n
2132/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2133/// return value. \n
2134/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2135/// return value.
2136/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2137/// values.
2138static __inline __m256 __DEFAULT_FN_ATTRS
2140{
2141 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2142}
2143
2144/// Moves and duplicates double-precision floating point values from a
2145/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2146/// vector of [4 x double].
2147///
2148/// \headerfile <x86intrin.h>
2149///
2150/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2151///
2152/// \param __a
2153/// A 256-bit vector of [4 x double]. \n
2154/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2155/// return value. \n
2156/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2157/// the return value.
2158/// \returns A 256-bit vector of [4 x double] containing the moved and
2159/// duplicated values.
2160static __inline __m256d __DEFAULT_FN_ATTRS
2162{
2163 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2164}
2165
2166/* Unpack and Interleave */
2167/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2168/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2169///
2170/// \headerfile <x86intrin.h>
2171///
2172/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2173///
2174/// \param __a
2175/// A 256-bit floating-point vector of [4 x double]. \n
2176/// Bits [127:64] are written to bits [63:0] of the return value. \n
2177/// Bits [255:192] are written to bits [191:128] of the return value. \n
2178/// \param __b
2179/// A 256-bit floating-point vector of [4 x double]. \n
2180/// Bits [127:64] are written to bits [127:64] of the return value. \n
2181/// Bits [255:192] are written to bits [255:192] of the return value. \n
2182/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2183static __inline __m256d __DEFAULT_FN_ATTRS
2184_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2185{
2186 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2187}
2188
2189/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2190/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2191///
2192/// \headerfile <x86intrin.h>
2193///
2194/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2195///
2196/// \param __a
2197/// A 256-bit floating-point vector of [4 x double]. \n
2198/// Bits [63:0] are written to bits [63:0] of the return value. \n
2199/// Bits [191:128] are written to bits [191:128] of the return value.
2200/// \param __b
2201/// A 256-bit floating-point vector of [4 x double]. \n
2202/// Bits [63:0] are written to bits [127:64] of the return value. \n
2203/// Bits [191:128] are written to bits [255:192] of the return value. \n
2204/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2205static __inline __m256d __DEFAULT_FN_ATTRS
2206_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2207{
2208 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2209}
2210
2211/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2212/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2213/// vector of [8 x float].
2214///
2215/// \headerfile <x86intrin.h>
2216///
2217/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2218///
2219/// \param __a
2220/// A 256-bit vector of [8 x float]. \n
2221/// Bits [95:64] are written to bits [31:0] of the return value. \n
2222/// Bits [127:96] are written to bits [95:64] of the return value. \n
2223/// Bits [223:192] are written to bits [159:128] of the return value. \n
2224/// Bits [255:224] are written to bits [223:192] of the return value.
2225/// \param __b
2226/// A 256-bit vector of [8 x float]. \n
2227/// Bits [95:64] are written to bits [63:32] of the return value. \n
2228/// Bits [127:96] are written to bits [127:96] of the return value. \n
2229/// Bits [223:192] are written to bits [191:160] of the return value. \n
2230/// Bits [255:224] are written to bits [255:224] of the return value.
2231/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2232static __inline __m256 __DEFAULT_FN_ATTRS
2234{
2235 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2236}
2237
2238/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2239/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2240/// vector of [8 x float].
2241///
2242/// \headerfile <x86intrin.h>
2243///
2244/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2245///
2246/// \param __a
2247/// A 256-bit vector of [8 x float]. \n
2248/// Bits [31:0] are written to bits [31:0] of the return value. \n
2249/// Bits [63:32] are written to bits [95:64] of the return value. \n
2250/// Bits [159:128] are written to bits [159:128] of the return value. \n
2251/// Bits [191:160] are written to bits [223:192] of the return value.
2252/// \param __b
2253/// A 256-bit vector of [8 x float]. \n
2254/// Bits [31:0] are written to bits [63:32] of the return value. \n
2255/// Bits [63:32] are written to bits [127:96] of the return value. \n
2256/// Bits [159:128] are written to bits [191:160] of the return value. \n
2257/// Bits [191:160] are written to bits [255:224] of the return value.
2258/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2259static __inline __m256 __DEFAULT_FN_ATTRS
2261{
2262 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2263}
2264
2265/* Bit Test */
2266/// Given two 128-bit floating-point vectors of [2 x double], perform an
2267/// element-by-element comparison of the double-precision element in the
2268/// first source vector and the corresponding element in the second source
2269/// vector.
2270///
2271/// The EFLAGS register is updated as follows: \n
2272/// If there is at least one pair of double-precision elements where the
2273/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2274/// ZF flag is set to 1. \n
2275/// If there is at least one pair of double-precision elements where the
2276/// sign-bit of the first element is 0 and the sign-bit of the second element
2277/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2278/// This intrinsic returns the value of the ZF flag.
2279///
2280/// \headerfile <x86intrin.h>
2281///
2282/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2283///
2284/// \param __a
2285/// A 128-bit vector of [2 x double].
2286/// \param __b
2287/// A 128-bit vector of [2 x double].
2288/// \returns the ZF flag in the EFLAGS register.
2289static __inline int __DEFAULT_FN_ATTRS128
2290_mm_testz_pd(__m128d __a, __m128d __b)
2291{
2292 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2293}
2294
2295/// Given two 128-bit floating-point vectors of [2 x double], perform an
2296/// element-by-element comparison of the double-precision element in the
2297/// first source vector and the corresponding element in the second source
2298/// vector.
2299///
2300/// The EFLAGS register is updated as follows: \n
2301/// If there is at least one pair of double-precision elements where the
2302/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2303/// ZF flag is set to 1. \n
2304/// If there is at least one pair of double-precision elements where the
2305/// sign-bit of the first element is 0 and the sign-bit of the second element
2306/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2307/// This intrinsic returns the value of the CF flag.
2308///
2309/// \headerfile <x86intrin.h>
2310///
2311/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2312///
2313/// \param __a
2314/// A 128-bit vector of [2 x double].
2315/// \param __b
2316/// A 128-bit vector of [2 x double].
2317/// \returns the CF flag in the EFLAGS register.
2318static __inline int __DEFAULT_FN_ATTRS128
2319_mm_testc_pd(__m128d __a, __m128d __b)
2320{
2321 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2322}
2323
2324/// Given two 128-bit floating-point vectors of [2 x double], perform an
2325/// element-by-element comparison of the double-precision element in the
2326/// first source vector and the corresponding element in the second source
2327/// vector.
2328///
2329/// The EFLAGS register is updated as follows: \n
2330/// If there is at least one pair of double-precision elements where the
2331/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2332/// ZF flag is set to 1. \n
2333/// If there is at least one pair of double-precision elements where the
2334/// sign-bit of the first element is 0 and the sign-bit of the second element
2335/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2336/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2337/// otherwise it returns 0.
2338///
2339/// \headerfile <x86intrin.h>
2340///
2341/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2342///
2343/// \param __a
2344/// A 128-bit vector of [2 x double].
2345/// \param __b
2346/// A 128-bit vector of [2 x double].
2347/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2348static __inline int __DEFAULT_FN_ATTRS128
2349_mm_testnzc_pd(__m128d __a, __m128d __b)
2350{
2351 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2352}
2353
2354/// Given two 128-bit floating-point vectors of [4 x float], perform an
2355/// element-by-element comparison of the single-precision element in the
2356/// first source vector and the corresponding element in the second source
2357/// vector.
2358///
2359/// The EFLAGS register is updated as follows: \n
2360/// If there is at least one pair of single-precision elements where the
2361/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2362/// ZF flag is set to 1. \n
2363/// If there is at least one pair of single-precision elements where the
2364/// sign-bit of the first element is 0 and the sign-bit of the second element
2365/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2366/// This intrinsic returns the value of the ZF flag.
2367///
2368/// \headerfile <x86intrin.h>
2369///
2370/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2371///
2372/// \param __a
2373/// A 128-bit vector of [4 x float].
2374/// \param __b
2375/// A 128-bit vector of [4 x float].
2376/// \returns the ZF flag.
2377static __inline int __DEFAULT_FN_ATTRS128
2378_mm_testz_ps(__m128 __a, __m128 __b)
2379{
2380 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2381}
2382
2383/// Given two 128-bit floating-point vectors of [4 x float], perform an
2384/// element-by-element comparison of the single-precision element in the
2385/// first source vector and the corresponding element in the second source
2386/// vector.
2387///
2388/// The EFLAGS register is updated as follows: \n
2389/// If there is at least one pair of single-precision elements where the
2390/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2391/// ZF flag is set to 1. \n
2392/// If there is at least one pair of single-precision elements where the
2393/// sign-bit of the first element is 0 and the sign-bit of the second element
2394/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2395/// This intrinsic returns the value of the CF flag.
2396///
2397/// \headerfile <x86intrin.h>
2398///
2399/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2400///
2401/// \param __a
2402/// A 128-bit vector of [4 x float].
2403/// \param __b
2404/// A 128-bit vector of [4 x float].
2405/// \returns the CF flag.
2406static __inline int __DEFAULT_FN_ATTRS128
2407_mm_testc_ps(__m128 __a, __m128 __b)
2408{
2409 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2410}
2411
2412/// Given two 128-bit floating-point vectors of [4 x float], perform an
2413/// element-by-element comparison of the single-precision element in the
2414/// first source vector and the corresponding element in the second source
2415/// vector.
2416///
2417/// The EFLAGS register is updated as follows: \n
2418/// If there is at least one pair of single-precision elements where the
2419/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2420/// ZF flag is set to 1. \n
2421/// If there is at least one pair of single-precision elements where the
2422/// sign-bit of the first element is 0 and the sign-bit of the second element
2423/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2424/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2425/// otherwise it returns 0.
2426///
2427/// \headerfile <x86intrin.h>
2428///
2429/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2430///
2431/// \param __a
2432/// A 128-bit vector of [4 x float].
2433/// \param __b
2434/// A 128-bit vector of [4 x float].
2435/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2436static __inline int __DEFAULT_FN_ATTRS128
2437_mm_testnzc_ps(__m128 __a, __m128 __b)
2438{
2439 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2440}
2441
2442/// Given two 256-bit floating-point vectors of [4 x double], perform an
2443/// element-by-element comparison of the double-precision elements in the
2444/// first source vector and the corresponding elements in the second source
2445/// vector.
2446///
2447/// The EFLAGS register is updated as follows: \n
2448/// If there is at least one pair of double-precision elements where the
2449/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2450/// ZF flag is set to 1. \n
2451/// If there is at least one pair of double-precision elements where the
2452/// sign-bit of the first element is 0 and the sign-bit of the second element
2453/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2454/// This intrinsic returns the value of the ZF flag.
2455///
2456/// \headerfile <x86intrin.h>
2457///
2458/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2459///
2460/// \param __a
2461/// A 256-bit vector of [4 x double].
2462/// \param __b
2463/// A 256-bit vector of [4 x double].
2464/// \returns the ZF flag.
2465static __inline int __DEFAULT_FN_ATTRS
2466_mm256_testz_pd(__m256d __a, __m256d __b)
2467{
2468 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2469}
2470
2471/// Given two 256-bit floating-point vectors of [4 x double], perform an
2472/// element-by-element comparison of the double-precision elements in the
2473/// first source vector and the corresponding elements in the second source
2474/// vector.
2475///
2476/// The EFLAGS register is updated as follows: \n
2477/// If there is at least one pair of double-precision elements where the
2478/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2479/// ZF flag is set to 1. \n
2480/// If there is at least one pair of double-precision elements where the
2481/// sign-bit of the first element is 0 and the sign-bit of the second element
2482/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2483/// This intrinsic returns the value of the CF flag.
2484///
2485/// \headerfile <x86intrin.h>
2486///
2487/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2488///
2489/// \param __a
2490/// A 256-bit vector of [4 x double].
2491/// \param __b
2492/// A 256-bit vector of [4 x double].
2493/// \returns the CF flag.
2494static __inline int __DEFAULT_FN_ATTRS
2495_mm256_testc_pd(__m256d __a, __m256d __b)
2496{
2497 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2498}
2499
2500/// Given two 256-bit floating-point vectors of [4 x double], perform an
2501/// element-by-element comparison of the double-precision elements in the
2502/// first source vector and the corresponding elements in the second source
2503/// vector.
2504///
2505/// The EFLAGS register is updated as follows: \n
2506/// If there is at least one pair of double-precision elements where the
2507/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2508/// ZF flag is set to 1. \n
2509/// If there is at least one pair of double-precision elements where the
2510/// sign-bit of the first element is 0 and the sign-bit of the second element
2511/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2512/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2513/// otherwise it returns 0.
2514///
2515/// \headerfile <x86intrin.h>
2516///
2517/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2518///
2519/// \param __a
2520/// A 256-bit vector of [4 x double].
2521/// \param __b
2522/// A 256-bit vector of [4 x double].
2523/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2524static __inline int __DEFAULT_FN_ATTRS
2525_mm256_testnzc_pd(__m256d __a, __m256d __b)
2526{
2527 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2528}
2529
2530/// Given two 256-bit floating-point vectors of [8 x float], perform an
2531/// element-by-element comparison of the single-precision element in the
2532/// first source vector and the corresponding element in the second source
2533/// vector.
2534///
2535/// The EFLAGS register is updated as follows: \n
2536/// If there is at least one pair of single-precision elements where the
2537/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2538/// ZF flag is set to 1. \n
2539/// If there is at least one pair of single-precision elements where the
2540/// sign-bit of the first element is 0 and the sign-bit of the second element
2541/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2542/// This intrinsic returns the value of the ZF flag.
2543///
2544/// \headerfile <x86intrin.h>
2545///
2546/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2547///
2548/// \param __a
2549/// A 256-bit vector of [8 x float].
2550/// \param __b
2551/// A 256-bit vector of [8 x float].
2552/// \returns the ZF flag.
2553static __inline int __DEFAULT_FN_ATTRS
2554_mm256_testz_ps(__m256 __a, __m256 __b)
2555{
2556 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2557}
2558
2559/// Given two 256-bit floating-point vectors of [8 x float], perform an
2560/// element-by-element comparison of the single-precision element in the
2561/// first source vector and the corresponding element in the second source
2562/// vector.
2563///
2564/// The EFLAGS register is updated as follows: \n
2565/// If there is at least one pair of single-precision elements where the
2566/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2567/// ZF flag is set to 1. \n
2568/// If there is at least one pair of single-precision elements where the
2569/// sign-bit of the first element is 0 and the sign-bit of the second element
2570/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2571/// This intrinsic returns the value of the CF flag.
2572///
2573/// \headerfile <x86intrin.h>
2574///
2575/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2576///
2577/// \param __a
2578/// A 256-bit vector of [8 x float].
2579/// \param __b
2580/// A 256-bit vector of [8 x float].
2581/// \returns the CF flag.
2582static __inline int __DEFAULT_FN_ATTRS
2583_mm256_testc_ps(__m256 __a, __m256 __b)
2584{
2585 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2586}
2587
2588/// Given two 256-bit floating-point vectors of [8 x float], perform an
2589/// element-by-element comparison of the single-precision elements in the
2590/// first source vector and the corresponding elements in the second source
2591/// vector.
2592///
2593/// The EFLAGS register is updated as follows: \n
2594/// If there is at least one pair of single-precision elements where the
2595/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2596/// ZF flag is set to 1. \n
2597/// If there is at least one pair of single-precision elements where the
2598/// sign-bit of the first element is 0 and the sign-bit of the second element
2599/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2600/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2601/// otherwise it returns 0.
2602///
2603/// \headerfile <x86intrin.h>
2604///
2605/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2606///
2607/// \param __a
2608/// A 256-bit vector of [8 x float].
2609/// \param __b
2610/// A 256-bit vector of [8 x float].
2611/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2612static __inline int __DEFAULT_FN_ATTRS
2614{
2615 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2616}
2617
2618/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2619/// of the two source vectors.
2620///
2621/// The EFLAGS register is updated as follows: \n
2622/// If there is at least one pair of bits where both bits are 1, the ZF flag
2623/// is set to 0. Otherwise the ZF flag is set to 1. \n
2624/// If there is at least one pair of bits where the bit from the first source
2625/// vector is 0 and the bit from the second source vector is 1, the CF flag
2626/// is set to 0. Otherwise the CF flag is set to 1. \n
2627/// This intrinsic returns the value of the ZF flag.
2628///
2629/// \headerfile <x86intrin.h>
2630///
2631/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2632///
2633/// \param __a
2634/// A 256-bit integer vector.
2635/// \param __b
2636/// A 256-bit integer vector.
2637/// \returns the ZF flag.
2638static __inline int __DEFAULT_FN_ATTRS
2639_mm256_testz_si256(__m256i __a, __m256i __b)
2640{
2641 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2642}
2643
2644/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2645/// of the two source vectors.
2646///
2647/// The EFLAGS register is updated as follows: \n
2648/// If there is at least one pair of bits where both bits are 1, the ZF flag
2649/// is set to 0. Otherwise the ZF flag is set to 1. \n
2650/// If there is at least one pair of bits where the bit from the first source
2651/// vector is 0 and the bit from the second source vector is 1, the CF flag
2652/// is set to 0. Otherwise the CF flag is set to 1. \n
2653/// This intrinsic returns the value of the CF flag.
2654///
2655/// \headerfile <x86intrin.h>
2656///
2657/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2658///
2659/// \param __a
2660/// A 256-bit integer vector.
2661/// \param __b
2662/// A 256-bit integer vector.
2663/// \returns the CF flag.
2664static __inline int __DEFAULT_FN_ATTRS
2665_mm256_testc_si256(__m256i __a, __m256i __b)
2666{
2667 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2668}
2669
2670/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2671/// of the two source vectors.
2672///
2673/// The EFLAGS register is updated as follows: \n
2674/// If there is at least one pair of bits where both bits are 1, the ZF flag
2675/// is set to 0. Otherwise the ZF flag is set to 1. \n
2676/// If there is at least one pair of bits where the bit from the first source
2677/// vector is 0 and the bit from the second source vector is 1, the CF flag
2678/// is set to 0. Otherwise the CF flag is set to 1. \n
2679/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2680/// otherwise it returns 0.
2681///
2682/// \headerfile <x86intrin.h>
2683///
2684/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2685///
2686/// \param __a
2687/// A 256-bit integer vector.
2688/// \param __b
2689/// A 256-bit integer vector.
2690/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2691static __inline int __DEFAULT_FN_ATTRS
2693{
2694 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2695}
2696
2697/* Vector extract sign mask */
2698/// Extracts the sign bits of double-precision floating point elements
2699/// in a 256-bit vector of [4 x double] and writes them to the lower order
2700/// bits of the return value.
2701///
2702/// \headerfile <x86intrin.h>
2703///
2704/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2705///
2706/// \param __a
2707/// A 256-bit vector of [4 x double] containing the double-precision
2708/// floating point values with sign bits to be extracted.
2709/// \returns The sign bits from the operand, written to bits [3:0].
2710static __inline int __DEFAULT_FN_ATTRS
2712{
2713 return __builtin_ia32_movmskpd256((__v4df)__a);
2714}
2715
2716/// Extracts the sign bits of single-precision floating point elements
2717/// in a 256-bit vector of [8 x float] and writes them to the lower order
2718/// bits of the return value.
2719///
2720/// \headerfile <x86intrin.h>
2721///
2722/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2723///
2724/// \param __a
2725/// A 256-bit vector of [8 x float] containing the single-precision floating
2726/// point values with sign bits to be extracted.
2727/// \returns The sign bits from the operand, written to bits [7:0].
2728static __inline int __DEFAULT_FN_ATTRS
2730{
2731 return __builtin_ia32_movmskps256((__v8sf)__a);
2732}
2733
2734/* Vector __zero */
2735/// Zeroes the contents of all XMM or YMM registers.
2736///
2737/// \headerfile <x86intrin.h>
2738///
2739/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  /* No operands and no result: the builtin is issued only for its effect on
     the vector registers (VZEROALL). */
  __builtin_ia32_vzeroall();
  return;
}
2745
2746/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2747///
2748/// \headerfile <x86intrin.h>
2749///
2750/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  /* No operands and no result: the builtin is issued only for its effect on
     the upper halves of the YMM registers (VZEROUPPER). */
  __builtin_ia32_vzeroupper();
  return;
}
2756
2757/* Vector load with broadcast */
2758/// Loads a scalar single-precision floating point value from the
2759/// specified address pointed to by \a __a and broadcasts it to the elements
2760/// of a [4 x float] vector.
2761///
2762/// \headerfile <x86intrin.h>
2763///
2764/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2765///
2766/// \param __a
2767/// The single-precision floating point value to be broadcast.
2768/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2769/// equal to the broadcast value.
2770static __inline __m128 __DEFAULT_FN_ATTRS128
2772{
2773 struct __mm_broadcast_ss_struct {
2774 float __f;
2775 } __attribute__((__packed__, __may_alias__));
2776 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
2777 return __extension__ (__m128){ __f, __f, __f, __f };
2778}
2779
2780/// Loads a scalar double-precision floating point value from the
2781/// specified address pointed to by \a __a and broadcasts it to the elements
2782/// of a [4 x double] vector.
2783///
2784/// \headerfile <x86intrin.h>
2785///
2786/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
2787///
2788/// \param __a
2789/// The double-precision floating point value to be broadcast.
2790/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
2791/// equal to the broadcast value.
2792static __inline __m256d __DEFAULT_FN_ATTRS
2794{
2795 struct __mm256_broadcast_sd_struct {
2796 double __d;
2797 } __attribute__((__packed__, __may_alias__));
2798 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
2799 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
2800}
2801
2802/// Loads a scalar single-precision floating point value from the
2803/// specified address pointed to by \a __a and broadcasts it to the elements
2804/// of a [8 x float] vector.
2805///
2806/// \headerfile <x86intrin.h>
2807///
2808/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2809///
2810/// \param __a
2811/// The single-precision floating point value to be broadcast.
2812/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
2813/// equal to the broadcast value.
2814static __inline __m256 __DEFAULT_FN_ATTRS
2816{
2817 struct __mm256_broadcast_ss_struct {
2818 float __f;
2819 } __attribute__((__packed__, __may_alias__));
2820 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
2821 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
2822}
2823
2824/// Loads the data from a 128-bit vector of [2 x double] from the
2825/// specified address pointed to by \a __a and broadcasts it to 128-bit
2826/// elements in a 256-bit vector of [4 x double].
2827///
2828/// \headerfile <x86intrin.h>
2829///
2830/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
2831///
2832/// \param __a
2833/// The 128-bit vector of [2 x double] to be broadcast.
2834/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
2835/// equal to the broadcast value.
2836static __inline __m256d __DEFAULT_FN_ATTRS
2838{
2839 __m128d __b = _mm_loadu_pd((const double *)__a);
2840 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
2841 0, 1, 0, 1);
2842}
2843
2844/// Loads the data from a 128-bit vector of [4 x float] from the
2845/// specified address pointed to by \a __a and broadcasts it to 128-bit
2846/// elements in a 256-bit vector of [8 x float].
2847///
2848/// \headerfile <x86intrin.h>
2849///
2850/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
2851///
2852/// \param __a
2853/// The 128-bit vector of [4 x float] to be broadcast.
2854/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
2855/// equal to the broadcast value.
2856static __inline __m256 __DEFAULT_FN_ATTRS
2858{
2859 __m128 __b = _mm_loadu_ps((const float *)__a);
2860 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
2861 0, 1, 2, 3, 0, 1, 2, 3);
2862}
2863
2864/* SIMD load ops */
2865/// Loads 4 double-precision floating point values from a 32-byte aligned
2866/// memory location pointed to by \a __p into a vector of [4 x double].
2867///
2868/// \headerfile <x86intrin.h>
2869///
2870/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
2871///
2872/// \param __p
2873/// A 32-byte aligned pointer to a memory location containing
2874/// double-precision floating point values.
2875/// \returns A 256-bit vector of [4 x double] containing the moved values.
2876static __inline __m256d __DEFAULT_FN_ATTRS
2877_mm256_load_pd(double const *__p)
2878{
2879 return *(const __m256d *)__p;
2880}
2881
2882/// Loads 8 single-precision floating point values from a 32-byte aligned
2883/// memory location pointed to by \a __p into a vector of [8 x float].
2884///
2885/// \headerfile <x86intrin.h>
2886///
2887/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
2888///
2889/// \param __p
2890/// A 32-byte aligned pointer to a memory location containing float values.
2891/// \returns A 256-bit vector of [8 x float] containing the moved values.
2892static __inline __m256 __DEFAULT_FN_ATTRS
2893_mm256_load_ps(float const *__p)
2894{
2895 return *(const __m256 *)__p;
2896}
2897
2898/// Loads 4 double-precision floating point values from an unaligned
2899/// memory location pointed to by \a __p into a vector of [4 x double].
2900///
2901/// \headerfile <x86intrin.h>
2902///
2903/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
2904///
2905/// \param __p
2906/// A pointer to a memory location containing double-precision floating
2907/// point values.
2908/// \returns A 256-bit vector of [4 x double] containing the moved values.
2909static __inline __m256d __DEFAULT_FN_ATTRS
2910_mm256_loadu_pd(double const *__p)
2911{
2912 struct __loadu_pd {
2913 __m256d_u __v;
2914 } __attribute__((__packed__, __may_alias__));
2915 return ((const struct __loadu_pd*)__p)->__v;
2916}
2917
2918/// Loads 8 single-precision floating point values from an unaligned
2919/// memory location pointed to by \a __p into a vector of [8 x float].
2920///
2921/// \headerfile <x86intrin.h>
2922///
2923/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
2924///
2925/// \param __p
2926/// A pointer to a memory location containing single-precision floating
2927/// point values.
2928/// \returns A 256-bit vector of [8 x float] containing the moved values.
2929static __inline __m256 __DEFAULT_FN_ATTRS
2931{
2932 struct __loadu_ps {
2933 __m256_u __v;
2934 } __attribute__((__packed__, __may_alias__));
2935 return ((const struct __loadu_ps*)__p)->__v;
2936}
2937
2938/// Loads 256 bits of integer data from a 32-byte aligned memory
2939/// location pointed to by \a __p into elements of a 256-bit integer vector.
2940///
2941/// \headerfile <x86intrin.h>
2942///
2943/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
2944///
2945/// \param __p
2946/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
2947/// values.
2948/// \returns A 256-bit integer vector containing the moved values.
2949static __inline __m256i __DEFAULT_FN_ATTRS
2950_mm256_load_si256(__m256i const *__p)
2951{
2952 return *__p;
2953}
2954
2955/// Loads 256 bits of integer data from an unaligned memory location
2956/// pointed to by \a __p into a 256-bit integer vector.
2957///
2958/// \headerfile <x86intrin.h>
2959///
2960/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
2961///
2962/// \param __p
2963/// A pointer to a 256-bit integer vector containing integer values.
2964/// \returns A 256-bit integer vector containing the moved values.
2965static __inline __m256i __DEFAULT_FN_ATTRS
2966_mm256_loadu_si256(__m256i_u const *__p)
2967{
2968 struct __loadu_si256 {
2969 __m256i_u __v;
2970 } __attribute__((__packed__, __may_alias__));
2971 return ((const struct __loadu_si256*)__p)->__v;
2972}
2973
2974/// Loads 256 bits of integer data from an unaligned memory location
2975/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
2976/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
2977/// line boundary.
2978///
2979/// \headerfile <x86intrin.h>
2980///
2981/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
2982///
2983/// \param __p
2984/// A pointer to a 256-bit integer vector containing integer values.
2985/// \returns A 256-bit integer vector containing the moved values.
2986static __inline __m256i __DEFAULT_FN_ATTRS
2987_mm256_lddqu_si256(__m256i_u const *__p)
2988{
2989 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
2990}
2991
2992/* SIMD store ops */
2993/// Stores double-precision floating point values from a 256-bit vector
2994/// of [4 x double] to a 32-byte aligned memory location pointed to by
2995/// \a __p.
2996///
2997/// \headerfile <x86intrin.h>
2998///
2999/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3000///
3001/// \param __p
3002/// A 32-byte aligned pointer to a memory location that will receive the
/// double-precision floating point values.
3004/// \param __a
3005/// A 256-bit vector of [4 x double] containing the values to be moved.
3006static __inline void __DEFAULT_FN_ATTRS
3007_mm256_store_pd(double *__p, __m256d __a)
3008{
3009 *(__m256d *)__p = __a;
3010}
3011
3012/// Stores single-precision floating point values from a 256-bit vector
3013/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3014///
3015/// \headerfile <x86intrin.h>
3016///
3017/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3018///
3019/// \param __p
3020/// A 32-byte aligned pointer to a memory location that will receive the
3021/// float values.
3022/// \param __a
3023/// A 256-bit vector of [8 x float] containing the values to be moved.
3024static __inline void __DEFAULT_FN_ATTRS
3025_mm256_store_ps(float *__p, __m256 __a)
3026{
3027 *(__m256 *)__p = __a;
3028}
3029
3030/// Stores double-precision floating point values from a 256-bit vector
3031/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3032///
3033/// \headerfile <x86intrin.h>
3034///
3035/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3036///
3037/// \param __p
3038/// A pointer to a memory location that will receive the double-precision
3039/// floating point values.
3040/// \param __a
3041/// A 256-bit vector of [4 x double] containing the values to be moved.
3042static __inline void __DEFAULT_FN_ATTRS
3043_mm256_storeu_pd(double *__p, __m256d __a)
3044{
3045 struct __storeu_pd {
3046 __m256d_u __v;
3047 } __attribute__((__packed__, __may_alias__));
3048 ((struct __storeu_pd*)__p)->__v = __a;
3049}
3050
3051/// Stores single-precision floating point values from a 256-bit vector
3052/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3053///
3054/// \headerfile <x86intrin.h>
3055///
3056/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3057///
3058/// \param __p
3059/// A pointer to a memory location that will receive the float values.
3060/// \param __a
3061/// A 256-bit vector of [8 x float] containing the values to be moved.
3062static __inline void __DEFAULT_FN_ATTRS
3063_mm256_storeu_ps(float *__p, __m256 __a)
3064{
3065 struct __storeu_ps {
3066 __m256_u __v;
3067 } __attribute__((__packed__, __may_alias__));
3068 ((struct __storeu_ps*)__p)->__v = __a;
3069}
3070
3071/// Stores integer values from a 256-bit integer vector to a 32-byte
3072/// aligned memory location pointed to by \a __p.
3073///
3074/// \headerfile <x86intrin.h>
3075///
3076/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3077///
3078/// \param __p
3079/// A 32-byte aligned pointer to a memory location that will receive the
3080/// integer values.
3081/// \param __a
3082/// A 256-bit integer vector containing the values to be moved.
3083static __inline void __DEFAULT_FN_ATTRS
3084_mm256_store_si256(__m256i *__p, __m256i __a)
3085{
3086 *__p = __a;
3087}
3088
3089/// Stores integer values from a 256-bit integer vector to an unaligned
3090/// memory location pointed to by \a __p.
3091///
3092/// \headerfile <x86intrin.h>
3093///
3094/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3095///
3096/// \param __p
3097/// A pointer to a memory location that will receive the integer values.
3098/// \param __a
3099/// A 256-bit integer vector containing the values to be moved.
3100static __inline void __DEFAULT_FN_ATTRS
3101_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3102{
3103 struct __storeu_si256 {
3104 __m256i_u __v;
3105 } __attribute__((__packed__, __may_alias__));
3106 ((struct __storeu_si256*)__p)->__v = __a;
3107}
3108
3109/* Conditional load ops */
3110/// Conditionally loads double-precision floating point elements from a
3111/// memory location pointed to by \a __p into a 128-bit vector of
3112/// [2 x double], depending on the mask bits associated with each data
3113/// element.
3114///
3115/// \headerfile <x86intrin.h>
3116///
3117/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3118///
3119/// \param __p
3120/// A pointer to a memory location that contains the double-precision
3121/// floating point values.
3122/// \param __m
3123/// A 128-bit integer vector containing the mask. The most significant bit of
3124/// each data element represents the mask bits. If a mask bit is zero, the
3125/// corresponding value in the memory location is not loaded and the
3126/// corresponding field in the return value is set to zero.
3127/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3128static __inline __m128d __DEFAULT_FN_ATTRS128
3129_mm_maskload_pd(double const *__p, __m128i __m)
3130{
3131 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3132}
3133
3134/// Conditionally loads double-precision floating point elements from a
3135/// memory location pointed to by \a __p into a 256-bit vector of
3136/// [4 x double], depending on the mask bits associated with each data
3137/// element.
3138///
3139/// \headerfile <x86intrin.h>
3140///
3141/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3142///
3143/// \param __p
3144/// A pointer to a memory location that contains the double-precision
3145/// floating point values.
3146/// \param __m
3147/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3148/// significant bit of each quadword element represents the mask bits. If a
3149/// mask bit is zero, the corresponding value in the memory location is not
3150/// loaded and the corresponding field in the return value is set to zero.
3151/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3152static __inline __m256d __DEFAULT_FN_ATTRS
3153_mm256_maskload_pd(double const *__p, __m256i __m)
3154{
3155 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3156 (__v4di)__m);
3157}
3158
3159/// Conditionally loads single-precision floating point elements from a
3160/// memory location pointed to by \a __p into a 128-bit vector of
3161/// [4 x float], depending on the mask bits associated with each data
3162/// element.
3163///
3164/// \headerfile <x86intrin.h>
3165///
3166/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3167///
3168/// \param __p
3169/// A pointer to a memory location that contains the single-precision
3170/// floating point values.
3171/// \param __m
3172/// A 128-bit integer vector containing the mask. The most significant bit of
3173/// each data element represents the mask bits. If a mask bit is zero, the
3174/// corresponding value in the memory location is not loaded and the
3175/// corresponding field in the return value is set to zero.
3176/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3177static __inline __m128 __DEFAULT_FN_ATTRS128
3178_mm_maskload_ps(float const *__p, __m128i __m)
3179{
3180 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3181}
3182
3183/// Conditionally loads single-precision floating point elements from a
3184/// memory location pointed to by \a __p into a 256-bit vector of
3185/// [8 x float], depending on the mask bits associated with each data
3186/// element.
3187///
3188/// \headerfile <x86intrin.h>
3189///
3190/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3191///
3192/// \param __p
3193/// A pointer to a memory location that contains the single-precision
3194/// floating point values.
3195/// \param __m
3196/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3197/// significant bit of each dword element represents the mask bits. If a mask
3198/// bit is zero, the corresponding value in the memory location is not loaded
3199/// and the corresponding field in the return value is set to zero.
3200/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3201static __inline __m256 __DEFAULT_FN_ATTRS
3202_mm256_maskload_ps(float const *__p, __m256i __m)
3203{
3204 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3205}
3206
3207/* Conditional store ops */
3208/// Moves single-precision floating point values from a 256-bit vector
3209/// of [8 x float] to a memory location pointed to by \a __p, according to
3210/// the specified mask.
3211///
3212/// \headerfile <x86intrin.h>
3213///
3214/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3215///
3216/// \param __p
3217/// A pointer to a memory location that will receive the float values.
3218/// \param __m
3219/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3220/// significant bit of each dword element in the mask vector represents the
3221/// mask bits. If a mask bit is zero, the corresponding value from vector
3222/// \a __a is not stored and the corresponding field in the memory location
3223/// pointed to by \a __p is not changed.
3224/// \param __a
3225/// A 256-bit vector of [8 x float] containing the values to be stored.
3226static __inline void __DEFAULT_FN_ATTRS
3227_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3228{
3229 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3230}
3231
3232/// Moves double-precision values from a 128-bit vector of [2 x double]
3233/// to a memory location pointed to by \a __p, according to the specified
3234/// mask.
3235///
3236/// \headerfile <x86intrin.h>
3237///
3238/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3239///
3240/// \param __p
/// A pointer to a memory location that will receive the double-precision
/// floating point values.
3242/// \param __m
3243/// A 128-bit integer vector containing the mask. The most significant bit of
3244/// each field in the mask vector represents the mask bits. If a mask bit is
3245/// zero, the corresponding value from vector \a __a is not stored and the
3246/// corresponding field in the memory location pointed to by \a __p is not
3247/// changed.
3248/// \param __a
3249/// A 128-bit vector of [2 x double] containing the values to be stored.
3250static __inline void __DEFAULT_FN_ATTRS128
3251_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3252{
3253 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3254}
3255
3256/// Moves double-precision values from a 256-bit vector of [4 x double]
3257/// to a memory location pointed to by \a __p, according to the specified
3258/// mask.
3259///
3260/// \headerfile <x86intrin.h>
3261///
3262/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3263///
3264/// \param __p
/// A pointer to a memory location that will receive the double-precision
/// floating point values.
3266/// \param __m
3267/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3268/// significant bit of each quadword element in the mask vector represents
3269/// the mask bits. If a mask bit is zero, the corresponding value from vector
/// \a __a is not stored and the corresponding field in the memory location
3271/// pointed to by \a __p is not changed.
3272/// \param __a
3273/// A 256-bit vector of [4 x double] containing the values to be stored.
3274static __inline void __DEFAULT_FN_ATTRS
3275_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3276{
3277 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3278}
3279
3280/// Moves single-precision floating point values from a 128-bit vector
3281/// of [4 x float] to a memory location pointed to by \a __p, according to
3282/// the specified mask.
3283///
3284/// \headerfile <x86intrin.h>
3285///
3286/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3287///
3288/// \param __p
3289/// A pointer to a memory location that will receive the float values.
3290/// \param __m
3291/// A 128-bit integer vector containing the mask. The most significant bit of
3292/// each field in the mask vector represents the mask bits. If a mask bit is
/// zero, the corresponding value from vector \a __a is not stored and the
3294/// corresponding field in the memory location pointed to by \a __p is not
3295/// changed.
3296/// \param __a
3297/// A 128-bit vector of [4 x float] containing the values to be stored.
3298static __inline void __DEFAULT_FN_ATTRS128
3299_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3300{
3301 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3302}
3303
3304/* Cacheability support ops */
3305/// Moves integer data from a 256-bit integer vector to a 32-byte
3306/// aligned memory location. To minimize caching, the data is flagged as
3307/// non-temporal (unlikely to be used again soon).
3308///
3309/// \headerfile <x86intrin.h>
3310///
3311/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3312///
3313/// \param __a
3314/// A pointer to a 32-byte aligned memory location that will receive the
3315/// integer values.
3316/// \param __b
3317/// A 256-bit integer vector containing the values to be moved.
3318static __inline void __DEFAULT_FN_ATTRS
3320{
3321 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3322 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3323}
3324
3325/// Moves double-precision values from a 256-bit vector of [4 x double]
3326/// to a 32-byte aligned memory location. To minimize caching, the data is
3327/// flagged as non-temporal (unlikely to be used again soon).
3328///
3329/// \headerfile <x86intrin.h>
3330///
3331/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3332///
3333/// \param __a
3334/// A pointer to a 32-byte aligned memory location that will receive the
3335/// double-precision floating-point values.
3336/// \param __b
3337/// A 256-bit vector of [4 x double] containing the values to be moved.
3338static __inline void __DEFAULT_FN_ATTRS
3339_mm256_stream_pd(void *__a, __m256d __b)
3340{
3341 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3342 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3343}
3344
3345/// Moves single-precision floating point values from a 256-bit vector
3346/// of [8 x float] to a 32-byte aligned memory location. To minimize
3347/// caching, the data is flagged as non-temporal (unlikely to be used again
3348/// soon).
3349///
3350/// \headerfile <x86intrin.h>
3351///
3352/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3353///
3354/// \param __p
3355/// A pointer to a 32-byte aligned memory location that will receive the
3356/// single-precision floating point values.
3357/// \param __a
3358/// A 256-bit vector of [8 x float] containing the values to be moved.
3359static __inline void __DEFAULT_FN_ATTRS
3361{
3362 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3363 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3364}
3365
3366/* Create vectors */
3367/// Create a 256-bit vector of [4 x double] with undefined values.
3368///
3369/// \headerfile <x86intrin.h>
3370///
3371/// This intrinsic has no corresponding instruction.
3372///
3373/// \returns A 256-bit vector of [4 x double] containing undefined values.
3374static __inline__ __m256d __DEFAULT_FN_ATTRS
3376{
3377 return (__m256d)__builtin_ia32_undef256();
3378}
3379
3380/// Create a 256-bit vector of [8 x float] with undefined values.
3381///
3382/// \headerfile <x86intrin.h>
3383///
3384/// This intrinsic has no corresponding instruction.
3385///
3386/// \returns A 256-bit vector of [8 x float] containing undefined values.
3387static __inline__ __m256 __DEFAULT_FN_ATTRS
3389{
3390 return (__m256)__builtin_ia32_undef256();
3391}
3392
3393/// Create a 256-bit integer vector with undefined values.
3394///
3395/// \headerfile <x86intrin.h>
3396///
3397/// This intrinsic has no corresponding instruction.
3398///
3399/// \returns A 256-bit integer vector containing undefined values.
3400static __inline__ __m256i __DEFAULT_FN_ATTRS
3402{
3403 return (__m256i)__builtin_ia32_undef256();
3404}
3405
3406/// Constructs a 256-bit floating-point vector of [4 x double]
3407/// initialized with the specified double-precision floating-point values.
3408///
3409/// \headerfile <x86intrin.h>
3410///
3411/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3412/// instruction.
3413///
3414/// \param __a
3415/// A double-precision floating-point value used to initialize bits [255:192]
3416/// of the result.
3417/// \param __b
3418/// A double-precision floating-point value used to initialize bits [191:128]
3419/// of the result.
3420/// \param __c
3421/// A double-precision floating-point value used to initialize bits [127:64]
3422/// of the result.
3423/// \param __d
3424/// A double-precision floating-point value used to initialize bits [63:0]
3425/// of the result.
3426/// \returns An initialized 256-bit floating-point vector of [4 x double].
3427static __inline __m256d __DEFAULT_FN_ATTRS
3428_mm256_set_pd(double __a, double __b, double __c, double __d)
3429{
3430 return __extension__ (__m256d){ __d, __c, __b, __a };
3431}
3432
3433/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3434/// with the specified single-precision floating-point values.
3435///
3436/// \headerfile <x86intrin.h>
3437///
3438/// This intrinsic is a utility function and does not correspond to a specific
3439/// instruction.
3440///
3441/// \param __a
3442/// A single-precision floating-point value used to initialize bits [255:224]
3443/// of the result.
3444/// \param __b
3445/// A single-precision floating-point value used to initialize bits [223:192]
3446/// of the result.
3447/// \param __c
3448/// A single-precision floating-point value used to initialize bits [191:160]
3449/// of the result.
3450/// \param __d
3451/// A single-precision floating-point value used to initialize bits [159:128]
3452/// of the result.
3453/// \param __e
3454/// A single-precision floating-point value used to initialize bits [127:96]
3455/// of the result.
3456/// \param __f
3457/// A single-precision floating-point value used to initialize bits [95:64]
3458/// of the result.
3459/// \param __g
3460/// A single-precision floating-point value used to initialize bits [63:32]
3461/// of the result.
3462/// \param __h
3463/// A single-precision floating-point value used to initialize bits [31:0]
3464/// of the result.
3465/// \returns An initialized 256-bit floating-point vector of [8 x float].
3466static __inline __m256 __DEFAULT_FN_ATTRS
3467_mm256_set_ps(float __a, float __b, float __c, float __d,
3468 float __e, float __f, float __g, float __h)
3469{
3470 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3471}
3472
3473/// Constructs a 256-bit integer vector initialized with the specified
3474/// 32-bit integral values.
3475///
3476/// \headerfile <x86intrin.h>
3477///
3478/// This intrinsic is a utility function and does not correspond to a specific
3479/// instruction.
3480///
3481/// \param __i0
3482/// A 32-bit integral value used to initialize bits [255:224] of the result.
3483/// \param __i1
3484/// A 32-bit integral value used to initialize bits [223:192] of the result.
3485/// \param __i2
3486/// A 32-bit integral value used to initialize bits [191:160] of the result.
3487/// \param __i3
3488/// A 32-bit integral value used to initialize bits [159:128] of the result.
3489/// \param __i4
3490/// A 32-bit integral value used to initialize bits [127:96] of the result.
3491/// \param __i5
3492/// A 32-bit integral value used to initialize bits [95:64] of the result.
3493/// \param __i6
3494/// A 32-bit integral value used to initialize bits [63:32] of the result.
3495/// \param __i7
3496/// A 32-bit integral value used to initialize bits [31:0] of the result.
3497/// \returns An initialized 256-bit integer vector.
3498static __inline __m256i __DEFAULT_FN_ATTRS
3499_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3500 int __i4, int __i5, int __i6, int __i7)
3501{
3502 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3503}
3504
3505/// Constructs a 256-bit integer vector initialized with the specified
3506/// 16-bit integral values.
3507///
3508/// \headerfile <x86intrin.h>
3509///
3510/// This intrinsic is a utility function and does not correspond to a specific
3511/// instruction.
3512///
3513/// \param __w15
3514/// A 16-bit integral value used to initialize bits [255:240] of the result.
3515/// \param __w14
3516/// A 16-bit integral value used to initialize bits [239:224] of the result.
3517/// \param __w13
3518/// A 16-bit integral value used to initialize bits [223:208] of the result.
3519/// \param __w12
3520/// A 16-bit integral value used to initialize bits [207:192] of the result.
3521/// \param __w11
3522/// A 16-bit integral value used to initialize bits [191:176] of the result.
3523/// \param __w10
3524/// A 16-bit integral value used to initialize bits [175:160] of the result.
3525/// \param __w09
3526/// A 16-bit integral value used to initialize bits [159:144] of the result.
3527/// \param __w08
3528/// A 16-bit integral value used to initialize bits [143:128] of the result.
3529/// \param __w07
3530/// A 16-bit integral value used to initialize bits [127:112] of the result.
3531/// \param __w06
3532/// A 16-bit integral value used to initialize bits [111:96] of the result.
3533/// \param __w05
3534/// A 16-bit integral value used to initialize bits [95:80] of the result.
3535/// \param __w04
3536/// A 16-bit integral value used to initialize bits [79:64] of the result.
3537/// \param __w03
3538/// A 16-bit integral value used to initialize bits [63:48] of the result.
3539/// \param __w02
3540/// A 16-bit integral value used to initialize bits [47:32] of the result.
3541/// \param __w01
3542/// A 16-bit integral value used to initialize bits [31:16] of the result.
3543/// \param __w00
3544/// A 16-bit integral value used to initialize bits [15:0] of the result.
3545/// \returns An initialized 256-bit integer vector.
3546static __inline __m256i __DEFAULT_FN_ATTRS
3547_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3548 short __w11, short __w10, short __w09, short __w08,
3549 short __w07, short __w06, short __w05, short __w04,
3550 short __w03, short __w02, short __w01, short __w00)
3551{
3552 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3553 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3554}
3555
3556/// Constructs a 256-bit integer vector initialized with the specified
3557/// 8-bit integral values.
3558///
3559/// \headerfile <x86intrin.h>
3560///
3561/// This intrinsic is a utility function and does not correspond to a specific
3562/// instruction.
3563///
3564/// \param __b31
3565/// An 8-bit integral value used to initialize bits [255:248] of the result.
3566/// \param __b30
3567/// An 8-bit integral value used to initialize bits [247:240] of the result.
3568/// \param __b29
3569/// An 8-bit integral value used to initialize bits [239:232] of the result.
3570/// \param __b28
3571/// An 8-bit integral value used to initialize bits [231:224] of the result.
3572/// \param __b27
3573/// An 8-bit integral value used to initialize bits [223:216] of the result.
3574/// \param __b26
3575/// An 8-bit integral value used to initialize bits [215:208] of the result.
3576/// \param __b25
3577/// An 8-bit integral value used to initialize bits [207:200] of the result.
3578/// \param __b24
3579/// An 8-bit integral value used to initialize bits [199:192] of the result.
3580/// \param __b23
3581/// An 8-bit integral value used to initialize bits [191:184] of the result.
3582/// \param __b22
3583/// An 8-bit integral value used to initialize bits [183:176] of the result.
3584/// \param __b21
3585/// An 8-bit integral value used to initialize bits [175:168] of the result.
3586/// \param __b20
3587/// An 8-bit integral value used to initialize bits [167:160] of the result.
3588/// \param __b19
3589/// An 8-bit integral value used to initialize bits [159:152] of the result.
3590/// \param __b18
3591/// An 8-bit integral value used to initialize bits [151:144] of the result.
3592/// \param __b17
3593/// An 8-bit integral value used to initialize bits [143:136] of the result.
3594/// \param __b16
3595/// An 8-bit integral value used to initialize bits [135:128] of the result.
3596/// \param __b15
3597/// An 8-bit integral value used to initialize bits [127:120] of the result.
3598/// \param __b14
3599/// An 8-bit integral value used to initialize bits [119:112] of the result.
3600/// \param __b13
3601/// An 8-bit integral value used to initialize bits [111:104] of the result.
3602/// \param __b12
3603/// An 8-bit integral value used to initialize bits [103:96] of the result.
3604/// \param __b11
3605/// An 8-bit integral value used to initialize bits [95:88] of the result.
3606/// \param __b10
3607/// An 8-bit integral value used to initialize bits [87:80] of the result.
3608/// \param __b09
3609/// An 8-bit integral value used to initialize bits [79:72] of the result.
3610/// \param __b08
3611/// An 8-bit integral value used to initialize bits [71:64] of the result.
3612/// \param __b07
3613/// An 8-bit integral value used to initialize bits [63:56] of the result.
3614/// \param __b06
3615/// An 8-bit integral value used to initialize bits [55:48] of the result.
3616/// \param __b05
3617/// An 8-bit integral value used to initialize bits [47:40] of the result.
3618/// \param __b04
3619/// An 8-bit integral value used to initialize bits [39:32] of the result.
3620/// \param __b03
3621/// An 8-bit integral value used to initialize bits [31:24] of the result.
3622/// \param __b02
3623/// An 8-bit integral value used to initialize bits [23:16] of the result.
3624/// \param __b01
3625/// An 8-bit integral value used to initialize bits [15:8] of the result.
3626/// \param __b00
3627/// An 8-bit integral value used to initialize bits [7:0] of the result.
3628/// \returns An initialized 256-bit integer vector.
3629static __inline __m256i __DEFAULT_FN_ATTRS
3630_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3631 char __b27, char __b26, char __b25, char __b24,
3632 char __b23, char __b22, char __b21, char __b20,
3633 char __b19, char __b18, char __b17, char __b16,
3634 char __b15, char __b14, char __b13, char __b12,
3635 char __b11, char __b10, char __b09, char __b08,
3636 char __b07, char __b06, char __b05, char __b04,
3637 char __b03, char __b02, char __b01, char __b00)
3638{
3639 return __extension__ (__m256i)(__v32qi){
3640 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3641 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3642 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3643 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3644 };
3645}
3646
3647/// Constructs a 256-bit integer vector initialized with the specified
3648/// 64-bit integral values.
3649///
3650/// \headerfile <x86intrin.h>
3651///
3652/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3653/// instruction.
3654///
3655/// \param __a
3656/// A 64-bit integral value used to initialize bits [255:192] of the result.
3657/// \param __b
3658/// A 64-bit integral value used to initialize bits [191:128] of the result.
3659/// \param __c
3660/// A 64-bit integral value used to initialize bits [127:64] of the result.
3661/// \param __d
3662/// A 64-bit integral value used to initialize bits [63:0] of the result.
3663/// \returns An initialized 256-bit integer vector.
3664static __inline __m256i __DEFAULT_FN_ATTRS
3665_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3666{
3667 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3668}
3669
3670/* Create vectors with elements in reverse order */
3671/// Constructs a 256-bit floating-point vector of [4 x double],
3672/// initialized in reverse order with the specified double-precision
3673/// floating-point values.
3674///
3675/// \headerfile <x86intrin.h>
3676///
3677/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3678/// instruction.
3679///
3680/// \param __a
3681/// A double-precision floating-point value used to initialize bits [63:0]
3682/// of the result.
3683/// \param __b
3684/// A double-precision floating-point value used to initialize bits [127:64]
3685/// of the result.
3686/// \param __c
3687/// A double-precision floating-point value used to initialize bits [191:128]
3688/// of the result.
3689/// \param __d
3690/// A double-precision floating-point value used to initialize bits [255:192]
3691/// of the result.
3692/// \returns An initialized 256-bit floating-point vector of [4 x double].
3693static __inline __m256d __DEFAULT_FN_ATTRS
3694_mm256_setr_pd(double __a, double __b, double __c, double __d)
3695{
3696 return _mm256_set_pd(__d, __c, __b, __a);
3697}
3698
3699/// Constructs a 256-bit floating-point vector of [8 x float],
3700/// initialized in reverse order with the specified single-precision
/// floating-point values.
3702///
3703/// \headerfile <x86intrin.h>
3704///
3705/// This intrinsic is a utility function and does not correspond to a specific
3706/// instruction.
3707///
3708/// \param __a
3709/// A single-precision floating-point value used to initialize bits [31:0]
3710/// of the result.
3711/// \param __b
3712/// A single-precision floating-point value used to initialize bits [63:32]
3713/// of the result.
3714/// \param __c
3715/// A single-precision floating-point value used to initialize bits [95:64]
3716/// of the result.
3717/// \param __d
3718/// A single-precision floating-point value used to initialize bits [127:96]
3719/// of the result.
3720/// \param __e
3721/// A single-precision floating-point value used to initialize bits [159:128]
3722/// of the result.
3723/// \param __f
3724/// A single-precision floating-point value used to initialize bits [191:160]
3725/// of the result.
3726/// \param __g
3727/// A single-precision floating-point value used to initialize bits [223:192]
3728/// of the result.
3729/// \param __h
3730/// A single-precision floating-point value used to initialize bits [255:224]
3731/// of the result.
3732/// \returns An initialized 256-bit floating-point vector of [8 x float].
3733static __inline __m256 __DEFAULT_FN_ATTRS
3734_mm256_setr_ps(float __a, float __b, float __c, float __d,
3735 float __e, float __f, float __g, float __h)
3736{
3737 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3738}
3739
3740/// Constructs a 256-bit integer vector, initialized in reverse order
3741/// with the specified 32-bit integral values.
3742///
3743/// \headerfile <x86intrin.h>
3744///
3745/// This intrinsic is a utility function and does not correspond to a specific
3746/// instruction.
3747///
3748/// \param __i0
3749/// A 32-bit integral value used to initialize bits [31:0] of the result.
3750/// \param __i1
3751/// A 32-bit integral value used to initialize bits [63:32] of the result.
3752/// \param __i2
3753/// A 32-bit integral value used to initialize bits [95:64] of the result.
3754/// \param __i3
3755/// A 32-bit integral value used to initialize bits [127:96] of the result.
3756/// \param __i4
3757/// A 32-bit integral value used to initialize bits [159:128] of the result.
3758/// \param __i5
3759/// A 32-bit integral value used to initialize bits [191:160] of the result.
3760/// \param __i6
3761/// A 32-bit integral value used to initialize bits [223:192] of the result.
3762/// \param __i7
3763/// A 32-bit integral value used to initialize bits [255:224] of the result.
3764/// \returns An initialized 256-bit integer vector.
3765static __inline __m256i __DEFAULT_FN_ATTRS
3766_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3767 int __i4, int __i5, int __i6, int __i7)
3768{
3769 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
3770}
3771
3772/// Constructs a 256-bit integer vector, initialized in reverse order
3773/// with the specified 16-bit integral values.
3774///
3775/// \headerfile <x86intrin.h>
3776///
3777/// This intrinsic is a utility function and does not correspond to a specific
3778/// instruction.
3779///
3780/// \param __w15
3781/// A 16-bit integral value used to initialize bits [15:0] of the result.
3782/// \param __w14
3783/// A 16-bit integral value used to initialize bits [31:16] of the result.
3784/// \param __w13
3785/// A 16-bit integral value used to initialize bits [47:32] of the result.
3786/// \param __w12
3787/// A 16-bit integral value used to initialize bits [63:48] of the result.
3788/// \param __w11
3789/// A 16-bit integral value used to initialize bits [79:64] of the result.
3790/// \param __w10
3791/// A 16-bit integral value used to initialize bits [95:80] of the result.
3792/// \param __w09
3793/// A 16-bit integral value used to initialize bits [111:96] of the result.
3794/// \param __w08
3795/// A 16-bit integral value used to initialize bits [127:112] of the result.
3796/// \param __w07
3797/// A 16-bit integral value used to initialize bits [143:128] of the result.
3798/// \param __w06
3799/// A 16-bit integral value used to initialize bits [159:144] of the result.
3800/// \param __w05
3801/// A 16-bit integral value used to initialize bits [175:160] of the result.
3802/// \param __w04
3803/// A 16-bit integral value used to initialize bits [191:176] of the result.
3804/// \param __w03
3805/// A 16-bit integral value used to initialize bits [207:192] of the result.
3806/// \param __w02
3807/// A 16-bit integral value used to initialize bits [223:208] of the result.
3808/// \param __w01
3809/// A 16-bit integral value used to initialize bits [239:224] of the result.
3810/// \param __w00
3811/// A 16-bit integral value used to initialize bits [255:240] of the result.
3812/// \returns An initialized 256-bit integer vector.
3813static __inline __m256i __DEFAULT_FN_ATTRS
3814_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
3815 short __w11, short __w10, short __w09, short __w08,
3816 short __w07, short __w06, short __w05, short __w04,
3817 short __w03, short __w02, short __w01, short __w00)
3818{
3819 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
3820 __w04, __w05, __w06, __w07,
3821 __w08, __w09, __w10, __w11,
3822 __w12, __w13, __w14, __w15);
3823}
3824
3825/// Constructs a 256-bit integer vector, initialized in reverse order
3826/// with the specified 8-bit integral values.
3827///
3828/// \headerfile <x86intrin.h>
3829///
3830/// This intrinsic is a utility function and does not correspond to a specific
3831/// instruction.
3832///
3833/// \param __b31
3834/// An 8-bit integral value used to initialize bits [7:0] of the result.
3835/// \param __b30
3836/// An 8-bit integral value used to initialize bits [15:8] of the result.
3837/// \param __b29
3838/// An 8-bit integral value used to initialize bits [23:16] of the result.
3839/// \param __b28
3840/// An 8-bit integral value used to initialize bits [31:24] of the result.
3841/// \param __b27
3842/// An 8-bit integral value used to initialize bits [39:32] of the result.
3843/// \param __b26
3844/// An 8-bit integral value used to initialize bits [47:40] of the result.
3845/// \param __b25
3846/// An 8-bit integral value used to initialize bits [55:48] of the result.
3847/// \param __b24
3848/// An 8-bit integral value used to initialize bits [63:56] of the result.
3849/// \param __b23
3850/// An 8-bit integral value used to initialize bits [71:64] of the result.
3851/// \param __b22
3852/// An 8-bit integral value used to initialize bits [79:72] of the result.
3853/// \param __b21
3854/// An 8-bit integral value used to initialize bits [87:80] of the result.
3855/// \param __b20
3856/// An 8-bit integral value used to initialize bits [95:88] of the result.
3857/// \param __b19
3858/// An 8-bit integral value used to initialize bits [103:96] of the result.
3859/// \param __b18
3860/// An 8-bit integral value used to initialize bits [111:104] of the result.
3861/// \param __b17
3862/// An 8-bit integral value used to initialize bits [119:112] of the result.
3863/// \param __b16
3864/// An 8-bit integral value used to initialize bits [127:120] of the result.
3865/// \param __b15
3866/// An 8-bit integral value used to initialize bits [135:128] of the result.
3867/// \param __b14
3868/// An 8-bit integral value used to initialize bits [143:136] of the result.
3869/// \param __b13
3870/// An 8-bit integral value used to initialize bits [151:144] of the result.
3871/// \param __b12
3872/// An 8-bit integral value used to initialize bits [159:152] of the result.
3873/// \param __b11
3874/// An 8-bit integral value used to initialize bits [167:160] of the result.
3875/// \param __b10
3876/// An 8-bit integral value used to initialize bits [175:168] of the result.
3877/// \param __b09
3878/// An 8-bit integral value used to initialize bits [183:176] of the result.
3879/// \param __b08
3880/// An 8-bit integral value used to initialize bits [191:184] of the result.
3881/// \param __b07
3882/// An 8-bit integral value used to initialize bits [199:192] of the result.
3883/// \param __b06
3884/// An 8-bit integral value used to initialize bits [207:200] of the result.
3885/// \param __b05
3886/// An 8-bit integral value used to initialize bits [215:208] of the result.
3887/// \param __b04
3888/// An 8-bit integral value used to initialize bits [223:216] of the result.
3889/// \param __b03
3890/// An 8-bit integral value used to initialize bits [231:224] of the result.
3891/// \param __b02
3892/// An 8-bit integral value used to initialize bits [239:232] of the result.
3893/// \param __b01
3894/// An 8-bit integral value used to initialize bits [247:240] of the result.
3895/// \param __b00
3896/// An 8-bit integral value used to initialize bits [255:248] of the result.
3897/// \returns An initialized 256-bit integer vector.
3898static __inline __m256i __DEFAULT_FN_ATTRS
3899_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
3900 char __b27, char __b26, char __b25, char __b24,
3901 char __b23, char __b22, char __b21, char __b20,
3902 char __b19, char __b18, char __b17, char __b16,
3903 char __b15, char __b14, char __b13, char __b12,
3904 char __b11, char __b10, char __b09, char __b08,
3905 char __b07, char __b06, char __b05, char __b04,
3906 char __b03, char __b02, char __b01, char __b00)
3907{
3908 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3909 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3910 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3911 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
3912}
3913
3914/// Constructs a 256-bit integer vector, initialized in reverse order
3915/// with the specified 64-bit integral values.
3916///
3917/// \headerfile <x86intrin.h>
3918///
3919/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3920/// instruction.
3921///
3922/// \param __a
3923/// A 64-bit integral value used to initialize bits [63:0] of the result.
3924/// \param __b
3925/// A 64-bit integral value used to initialize bits [127:64] of the result.
3926/// \param __c
3927/// A 64-bit integral value used to initialize bits [191:128] of the result.
3928/// \param __d
3929/// A 64-bit integral value used to initialize bits [255:192] of the result.
3930/// \returns An initialized 256-bit integer vector.
3931static __inline __m256i __DEFAULT_FN_ATTRS
3932_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
3933{
3934 return _mm256_set_epi64x(__d, __c, __b, __a);
3935}
3936
/* Create vectors with repeated elements */
3938/// Constructs a 256-bit floating-point vector of [4 x double], with each
3939/// of the four double-precision floating-point vector elements set to the
3940/// specified double-precision floating-point value.
3941///
3942/// \headerfile <x86intrin.h>
3943///
3944/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
3945///
3946/// \param __w
3947/// A double-precision floating-point value used to initialize each vector
3948/// element of the result.
3949/// \returns An initialized 256-bit floating-point vector of [4 x double].
3950static __inline __m256d __DEFAULT_FN_ATTRS
3952{
3953 return _mm256_set_pd(__w, __w, __w, __w);
3954}
3955
3956/// Constructs a 256-bit floating-point vector of [8 x float], with each
3957/// of the eight single-precision floating-point vector elements set to the
3958/// specified single-precision floating-point value.
3959///
3960/// \headerfile <x86intrin.h>
3961///
3962/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
3963/// instruction.
3964///
3965/// \param __w
3966/// A single-precision floating-point value used to initialize each vector
3967/// element of the result.
3968/// \returns An initialized 256-bit floating-point vector of [8 x float].
3969static __inline __m256 __DEFAULT_FN_ATTRS
3971{
3972 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
3973}
3974
3975/// Constructs a 256-bit integer vector of [8 x i32], with each of the
3976/// 32-bit integral vector elements set to the specified 32-bit integral
3977/// value.
3978///
3979/// \headerfile <x86intrin.h>
3980///
3981/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
3982/// instruction.
3983///
3984/// \param __i
3985/// A 32-bit integral value used to initialize each vector element of the
3986/// result.
3987/// \returns An initialized 256-bit integer vector of [8 x i32].
3988static __inline __m256i __DEFAULT_FN_ATTRS
3990{
3991 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
3992}
3993
3994/// Constructs a 256-bit integer vector of [16 x i16], with each of the
3995/// 16-bit integral vector elements set to the specified 16-bit integral
3996/// value.
3997///
3998/// \headerfile <x86intrin.h>
3999///
4000/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4001///
4002/// \param __w
4003/// A 16-bit integral value used to initialize each vector element of the
4004/// result.
4005/// \returns An initialized 256-bit integer vector of [16 x i16].
4006static __inline __m256i __DEFAULT_FN_ATTRS
4008{
4009 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4010 __w, __w, __w, __w, __w, __w, __w, __w);
4011}
4012
4013/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4014/// 8-bit integral vector elements set to the specified 8-bit integral value.
4015///
4016/// \headerfile <x86intrin.h>
4017///
4018/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4019///
4020/// \param __b
4021/// An 8-bit integral value used to initialize each vector element of the
4022/// result.
4023/// \returns An initialized 256-bit integer vector of [32 x i8].
4024static __inline __m256i __DEFAULT_FN_ATTRS
4026{
4027 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4028 __b, __b, __b, __b, __b, __b, __b, __b,
4029 __b, __b, __b, __b, __b, __b, __b, __b,
4030 __b, __b, __b, __b, __b, __b, __b, __b);
4031}
4032
4033/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4034/// 64-bit integral vector elements set to the specified 64-bit integral
4035/// value.
4036///
4037/// \headerfile <x86intrin.h>
4038///
4039/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4040///
4041/// \param __q
4042/// A 64-bit integral value used to initialize each vector element of the
4043/// result.
4044/// \returns An initialized 256-bit integer vector of [4 x i64].
4045static __inline __m256i __DEFAULT_FN_ATTRS
4047{
4048 return _mm256_set_epi64x(__q, __q, __q, __q);
4049}
4050
/* Create zeroed vectors */
4052/// Constructs a 256-bit floating-point vector of [4 x double] with all
4053/// vector elements initialized to zero.
4054///
4055/// \headerfile <x86intrin.h>
4056///
4057/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4058///
4059/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4060static __inline __m256d __DEFAULT_FN_ATTRS
4062{
4063 return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
4064}
4065
4066/// Constructs a 256-bit floating-point vector of [8 x float] with all
4067/// vector elements initialized to zero.
4068///
4069/// \headerfile <x86intrin.h>
4070///
4071/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4072///
4073/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4074static __inline __m256 __DEFAULT_FN_ATTRS
4076{
4077 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4078}
4079
4080/// Constructs a 256-bit integer vector initialized to zero.
4081///
4082/// \headerfile <x86intrin.h>
4083///
4084/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4085///
4086/// \returns A 256-bit integer vector initialized to zero.
4087static __inline __m256i __DEFAULT_FN_ATTRS
4089{
4090 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4091}
4092
/* Cast between vector types */
4094/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4095/// floating-point vector of [8 x float].
4096///
4097/// \headerfile <x86intrin.h>
4098///
4099/// This intrinsic has no corresponding instruction.
4100///
4101/// \param __a
4102/// A 256-bit floating-point vector of [4 x double].
4103/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4104/// bitwise pattern as the parameter.
4105static __inline __m256 __DEFAULT_FN_ATTRS
4107{
4108 return (__m256)__a;
4109}
4110
4111/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4112/// integer vector.
4113///
4114/// \headerfile <x86intrin.h>
4115///
4116/// This intrinsic has no corresponding instruction.
4117///
4118/// \param __a
4119/// A 256-bit floating-point vector of [4 x double].
4120/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4121/// parameter.
4122static __inline __m256i __DEFAULT_FN_ATTRS
4124{
4125 return (__m256i)__a;
4126}
4127
4128/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4129/// floating-point vector of [4 x double].
4130///
4131/// \headerfile <x86intrin.h>
4132///
4133/// This intrinsic has no corresponding instruction.
4134///
4135/// \param __a
4136/// A 256-bit floating-point vector of [8 x float].
4137/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4138/// bitwise pattern as the parameter.
4139static __inline __m256d __DEFAULT_FN_ATTRS
4141{
4142 return (__m256d)__a;
4143}
4144
4145/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4146/// integer vector.
4147///
4148/// \headerfile <x86intrin.h>
4149///
4150/// This intrinsic has no corresponding instruction.
4151///
4152/// \param __a
4153/// A 256-bit floating-point vector of [8 x float].
4154/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4155/// parameter.
4156static __inline __m256i __DEFAULT_FN_ATTRS
4158{
4159 return (__m256i)__a;
4160}
4161
4162/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4163/// of [8 x float].
4164///
4165/// \headerfile <x86intrin.h>
4166///
4167/// This intrinsic has no corresponding instruction.
4168///
4169/// \param __a
4170/// A 256-bit integer vector.
4171/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4172/// bitwise pattern as the parameter.
4173static __inline __m256 __DEFAULT_FN_ATTRS
4175{
4176 return (__m256)__a;
4177}
4178
4179/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4180/// of [4 x double].
4181///
4182/// \headerfile <x86intrin.h>
4183///
4184/// This intrinsic has no corresponding instruction.
4185///
4186/// \param __a
4187/// A 256-bit integer vector.
4188/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4189/// bitwise pattern as the parameter.
4190static __inline __m256d __DEFAULT_FN_ATTRS
4192{
4193 return (__m256d)__a;
4194}
4195
4196/// Returns the lower 128 bits of a 256-bit floating-point vector of
4197/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4198///
4199/// \headerfile <x86intrin.h>
4200///
4201/// This intrinsic has no corresponding instruction.
4202///
4203/// \param __a
4204/// A 256-bit floating-point vector of [4 x double].
4205/// \returns A 128-bit floating-point vector of [2 x double] containing the
4206/// lower 128 bits of the parameter.
4207static __inline __m128d __DEFAULT_FN_ATTRS
4209{
4210 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4211}
4212
4213/// Returns the lower 128 bits of a 256-bit floating-point vector of
4214/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4215///
4216/// \headerfile <x86intrin.h>
4217///
4218/// This intrinsic has no corresponding instruction.
4219///
4220/// \param __a
4221/// A 256-bit floating-point vector of [8 x float].
4222/// \returns A 128-bit floating-point vector of [4 x float] containing the
4223/// lower 128 bits of the parameter.
4224static __inline __m128 __DEFAULT_FN_ATTRS
4226{
4227 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4228}
4229
4230/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4231///
4232/// \headerfile <x86intrin.h>
4233///
4234/// This intrinsic has no corresponding instruction.
4235///
4236/// \param __a
4237/// A 256-bit integer vector.
4238/// \returns A 128-bit integer vector containing the lower 128 bits of the
4239/// parameter.
4240static __inline __m128i __DEFAULT_FN_ATTRS
4242{
4243 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4244}
4245
4246/// Constructs a 256-bit floating-point vector of [4 x double] from a
4247/// 128-bit floating-point vector of [2 x double].
4248///
4249/// The lower 128 bits contain the value of the source vector. The contents
4250/// of the upper 128 bits are undefined.
4251///
4252/// \headerfile <x86intrin.h>
4253///
4254/// This intrinsic has no corresponding instruction.
4255///
4256/// \param __a
4257/// A 128-bit vector of [2 x double].
4258/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4259/// contain the value of the parameter. The contents of the upper 128 bits
4260/// are undefined.
4261static __inline __m256d __DEFAULT_FN_ATTRS
4263{
4264 return __builtin_shufflevector(
4265 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4266}
4267
4268/// Constructs a 256-bit floating-point vector of [8 x float] from a
4269/// 128-bit floating-point vector of [4 x float].
4270///
4271/// The lower 128 bits contain the value of the source vector. The contents
4272/// of the upper 128 bits are undefined.
4273///
4274/// \headerfile <x86intrin.h>
4275///
4276/// This intrinsic has no corresponding instruction.
4277///
4278/// \param __a
4279/// A 128-bit vector of [4 x float].
4280/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4281/// contain the value of the parameter. The contents of the upper 128 bits
4282/// are undefined.
4283static __inline __m256 __DEFAULT_FN_ATTRS
4285{
4286 return __builtin_shufflevector((__v4sf)__a,
4287 (__v4sf)__builtin_nondeterministic_value(__a),
4288 0, 1, 2, 3, 4, 5, 6, 7);
4289}
4290
4291/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4292///
4293/// The lower 128 bits contain the value of the source vector. The contents
4294/// of the upper 128 bits are undefined.
4295///
4296/// \headerfile <x86intrin.h>
4297///
4298/// This intrinsic has no corresponding instruction.
4299///
4300/// \param __a
4301/// A 128-bit integer vector.
4302/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4303/// the parameter. The contents of the upper 128 bits are undefined.
4304static __inline __m256i __DEFAULT_FN_ATTRS
4306{
4307 return __builtin_shufflevector(
4308 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4309}
4310
4311/// Constructs a 256-bit floating-point vector of [4 x double] from a
4312/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4313/// contain the value of the source vector. The upper 128 bits are set
4314/// to zero.
4315///
4316/// \headerfile <x86intrin.h>
4317///
4318/// This intrinsic has no corresponding instruction.
4319///
4320/// \param __a
4321/// A 128-bit vector of [2 x double].
4322/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4323/// contain the value of the parameter. The upper 128 bits are set to zero.
4324static __inline __m256d __DEFAULT_FN_ATTRS
4326{
4327 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4328}
4329
4330/// Constructs a 256-bit floating-point vector of [8 x float] from a
4331/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4332/// the value of the source vector. The upper 128 bits are set to zero.
4333///
4334/// \headerfile <x86intrin.h>
4335///
4336/// This intrinsic has no corresponding instruction.
4337///
4338/// \param __a
4339/// A 128-bit vector of [4 x float].
4340/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4341/// contain the value of the parameter. The upper 128 bits are set to zero.
4342static __inline __m256 __DEFAULT_FN_ATTRS
4344{
4345 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4346}
4347
4348/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4349/// The lower 128 bits contain the value of the source vector. The upper
4350/// 128 bits are set to zero.
4351///
4352/// \headerfile <x86intrin.h>
4353///
4354/// This intrinsic has no corresponding instruction.
4355///
4356/// \param __a
4357/// A 128-bit integer vector.
4358/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4359/// the parameter. The upper 128 bits are set to zero.
4360static __inline __m256i __DEFAULT_FN_ATTRS
4362{
4363 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4364}
4365
/*
   Vector insert.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// Constructs a new 256-bit vector of [8 x float] by first duplicating
///    a 256-bit vector of [8 x float] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [4 x float] in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4408
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
4446
/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
4484
/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
4513
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
4537
/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
4561
4562/// Constructs a 256-bit floating-point vector of [8 x float] by
4563/// concatenating two 128-bit floating-point vectors of [4 x float].
4564///
4565/// \headerfile <x86intrin.h>
4566///
4567/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4568///
4569/// \param __hi
4570/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4571/// 128 bits of the result.
4572/// \param __lo
4573/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4574/// 128 bits of the result.
4575/// \returns A 256-bit floating-point vector of [8 x float] containing the
4576/// concatenated result.
4577static __inline __m256 __DEFAULT_FN_ATTRS
4578_mm256_set_m128 (__m128 __hi, __m128 __lo)
4579{
4580 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4581}
4582
4583/// Constructs a 256-bit floating-point vector of [4 x double] by
4584/// concatenating two 128-bit floating-point vectors of [2 x double].
4585///
4586/// \headerfile <x86intrin.h>
4587///
4588/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4589///
4590/// \param __hi
4591/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4592/// 128 bits of the result.
4593/// \param __lo
4594/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4595/// 128 bits of the result.
4596/// \returns A 256-bit floating-point vector of [4 x double] containing the
4597/// concatenated result.
4598static __inline __m256d __DEFAULT_FN_ATTRS
4599_mm256_set_m128d (__m128d __hi, __m128d __lo)
4600{
4601 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4602}
4603
4604/// Constructs a 256-bit integer vector by concatenating two 128-bit
4605/// integer vectors.
4606///
4607/// \headerfile <x86intrin.h>
4608///
4609/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4610///
4611/// \param __hi
4612/// A 128-bit integer vector to be copied to the upper 128 bits of the
4613/// result.
4614/// \param __lo
4615/// A 128-bit integer vector to be copied to the lower 128 bits of the
4616/// result.
4617/// \returns A 256-bit integer vector containing the concatenated result.
4618static __inline __m256i __DEFAULT_FN_ATTRS
4619_mm256_set_m128i (__m128i __hi, __m128i __lo)
4620{
4621 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4622}
4623
4624/// Constructs a 256-bit floating-point vector of [8 x float] by
4625/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4626/// similar to _mm256_set_m128, but the order of the input parameters is
4627/// swapped.
4628///
4629/// \headerfile <x86intrin.h>
4630///
4631/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4632///
4633/// \param __lo
4634/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4635/// 128 bits of the result.
4636/// \param __hi
4637/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4638/// 128 bits of the result.
4639/// \returns A 256-bit floating-point vector of [8 x float] containing the
4640/// concatenated result.
4641static __inline __m256 __DEFAULT_FN_ATTRS
4642_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4643{
4644 return _mm256_set_m128(__hi, __lo);
4645}
4646
4647/// Constructs a 256-bit floating-point vector of [4 x double] by
4648/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4649/// similar to _mm256_set_m128d, but the order of the input parameters is
4650/// swapped.
4651///
4652/// \headerfile <x86intrin.h>
4653///
4654/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4655///
4656/// \param __lo
4657/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4658/// 128 bits of the result.
4659/// \param __hi
4660/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4661/// 128 bits of the result.
4662/// \returns A 256-bit floating-point vector of [4 x double] containing the
4663/// concatenated result.
4664static __inline __m256d __DEFAULT_FN_ATTRS
4665_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4666{
4667 return (__m256d)_mm256_set_m128d(__hi, __lo);
4668}
4669
4670/// Constructs a 256-bit integer vector by concatenating two 128-bit
4671/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4672/// the input parameters is swapped.
4673///
4674/// \headerfile <x86intrin.h>
4675///
4676/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4677///
4678/// \param __lo
4679/// A 128-bit integer vector to be copied to the lower 128 bits of the
4680/// result.
4681/// \param __hi
4682/// A 128-bit integer vector to be copied to the upper 128 bits of the
4683/// result.
4684/// \returns A 256-bit integer vector containing the concatenated result.
4685static __inline __m256i __DEFAULT_FN_ATTRS
4686_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4687{
4688 return (__m256i)_mm256_set_m128i(__hi, __lo);
4689}
4690
/* SIMD load ops (unaligned) */
4692/// Loads two 128-bit floating-point vectors of [4 x float] from
4693/// unaligned memory locations and constructs a 256-bit floating-point vector
4694/// of [8 x float] by concatenating the two 128-bit vectors.
4695///
4696/// \headerfile <x86intrin.h>
4697///
4698/// This intrinsic corresponds to load instructions followed by the
4699/// <c> VINSERTF128 </c> instruction.
4700///
4701/// \param __addr_hi
4702/// A pointer to a 128-bit memory location containing 4 consecutive
4703/// single-precision floating-point values. These values are to be copied to
4704/// bits[255:128] of the result. The address of the memory location does not
4705/// have to be aligned.
4706/// \param __addr_lo
4707/// A pointer to a 128-bit memory location containing 4 consecutive
4708/// single-precision floating-point values. These values are to be copied to
4709/// bits[127:0] of the result. The address of the memory location does not
4710/// have to be aligned.
4711/// \returns A 256-bit floating-point vector of [8 x float] containing the
4712/// concatenated result.
4713static __inline __m256 __DEFAULT_FN_ATTRS
4714_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4715{
4716 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4717}
4718
4719/// Loads two 128-bit floating-point vectors of [2 x double] from
4720/// unaligned memory locations and constructs a 256-bit floating-point vector
4721/// of [4 x double] by concatenating the two 128-bit vectors.
4722///
4723/// \headerfile <x86intrin.h>
4724///
4725/// This intrinsic corresponds to load instructions followed by the
4726/// <c> VINSERTF128 </c> instruction.
4727///
4728/// \param __addr_hi
4729/// A pointer to a 128-bit memory location containing two consecutive
4730/// double-precision floating-point values. These values are to be copied to
4731/// bits[255:128] of the result. The address of the memory location does not
4732/// have to be aligned.
4733/// \param __addr_lo
4734/// A pointer to a 128-bit memory location containing two consecutive
4735/// double-precision floating-point values. These values are to be copied to
4736/// bits[127:0] of the result. The address of the memory location does not
4737/// have to be aligned.
4738/// \returns A 256-bit floating-point vector of [4 x double] containing the
4739/// concatenated result.
4740static __inline __m256d __DEFAULT_FN_ATTRS
4741_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4742{
4743 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4744}
4745
4746/// Loads two 128-bit integer vectors from unaligned memory locations and
4747/// constructs a 256-bit integer vector by concatenating the two 128-bit
4748/// vectors.
4749///
4750/// \headerfile <x86intrin.h>
4751///
4752/// This intrinsic corresponds to load instructions followed by the
4753/// <c> VINSERTF128 </c> instruction.
4754///
4755/// \param __addr_hi
4756/// A pointer to a 128-bit memory location containing a 128-bit integer
4757/// vector. This vector is to be copied to bits[255:128] of the result. The
4758/// address of the memory location does not have to be aligned.
4759/// \param __addr_lo
4760/// A pointer to a 128-bit memory location containing a 128-bit integer
4761/// vector. This vector is to be copied to bits[127:0] of the result. The
4762/// address of the memory location does not have to be aligned.
4763/// \returns A 256-bit integer vector containing the concatenated result.
4764static __inline __m256i __DEFAULT_FN_ATTRS
4765_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
4766{
4767 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
4768}
4769
4770/* SIMD store ops (unaligned) */
4771/// Stores the upper and lower 128 bits of a 256-bit floating-point
4772/// vector of [8 x float] into two different unaligned memory locations.
4773///
4774/// \headerfile <x86intrin.h>
4775///
4776/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4777/// store instructions.
4778///
4779/// \param __addr_hi
4780/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4781/// copied to this memory location. The address of this memory location does
4782/// not have to be aligned.
4783/// \param __addr_lo
4784/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4785/// copied to this memory location. The address of this memory location does
4786/// not have to be aligned.
4787/// \param __a
4788/// A 256-bit floating-point vector of [8 x float].
4789static __inline void __DEFAULT_FN_ATTRS
4790_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4791{
4792 __m128 __v128;
4793
4794 __v128 = _mm256_castps256_ps128(__a);
4795 _mm_storeu_ps(__addr_lo, __v128);
4796 __v128 = _mm256_extractf128_ps(__a, 1);
4797 _mm_storeu_ps(__addr_hi, __v128);
4798}
4799
4800/// Stores the upper and lower 128 bits of a 256-bit floating-point
4801/// vector of [4 x double] into two different unaligned memory locations.
4802///
4803/// \headerfile <x86intrin.h>
4804///
4805/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4806/// store instructions.
4807///
4808/// \param __addr_hi
4809/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4810/// copied to this memory location. The address of this memory location does
4811/// not have to be aligned.
4812/// \param __addr_lo
4813/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4814/// copied to this memory location. The address of this memory location does
4815/// not have to be aligned.
4816/// \param __a
4817/// A 256-bit floating-point vector of [4 x double].
4818static __inline void __DEFAULT_FN_ATTRS
4819_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
4820{
4821 __m128d __v128;
4822
4823 __v128 = _mm256_castpd256_pd128(__a);
4824 _mm_storeu_pd(__addr_lo, __v128);
4825 __v128 = _mm256_extractf128_pd(__a, 1);
4826 _mm_storeu_pd(__addr_hi, __v128);
4827}
4828
4829/// Stores the upper and lower 128 bits of a 256-bit integer vector into
4830/// two different unaligned memory locations.
4831///
4832/// \headerfile <x86intrin.h>
4833///
4834/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4835/// store instructions.
4836///
4837/// \param __addr_hi
4838/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4839/// copied to this memory location. The address of this memory location does
4840/// not have to be aligned.
4841/// \param __addr_lo
4842/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4843/// copied to this memory location. The address of this memory location does
4844/// not have to be aligned.
4845/// \param __a
4846/// A 256-bit integer vector.
4847static __inline void __DEFAULT_FN_ATTRS
4848_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
4849{
4850 __m128i __v128;
4851
4852 __v128 = _mm256_castsi256_si128(__a);
4853 _mm_storeu_si128(__addr_lo, __v128);
4854 __v128 = _mm256_extractf128_si256(__a, 1);
4855 _mm_storeu_si128(__addr_hi, __v128);
4856}
4857
4858#undef __DEFAULT_FN_ATTRS
4859#undef __DEFAULT_FN_ATTRS128
4860
4861#endif /* __AVXINTRIN_H */
__device__ _Float16
__INLINE unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:59
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:2793
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition: avxintrin.h:4578
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition: avxintrin.h:2837
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:732
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition: avxintrin.h:3043
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2665
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition: avxintrin.h:88
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition: avxintrin.h:3339
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition: avxintrin.h:4174
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition: avxintrin.h:2857
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: avxintrin.h:3899
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition: avxintrin.h:2023
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition: avxintrin.h:3007
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4361
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition: avxintrin.h:3063
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition: avxintrin.h:2206
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and construct...
Definition: avxintrin.h:4714
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:344
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3153
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision fl...
Definition: avxintrin.h:3970
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition: avxintrin.h:4157
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:644
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition: avxintrin.h:1942
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:378
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition: avxintrin.h:2139
static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-p...
Definition: avxintrin.h:4208
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition: avxintrin.h:2711
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition: avxintrin.h:1958
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zer...
Definition: avxintrin.h:4075
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition: avxintrin.h:3388
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2613
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
Definition: avxintrin.h:4225
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3178
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3129
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set...
Definition: avxintrin.h:4025
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition: avxintrin.h:969
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition: avxintrin.h:1989
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition: avxintrin.h:3101
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float],...
Definition: avxintrin.h:4511
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the i...
Definition: avxintrin.h:4559
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition: avxintrin.h:2950
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2692
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition: avxintrin.h:1393
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition: avxintrin.h:4599
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition: avxintrin.h:4106
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision fl...
Definition: avxintrin.h:3951
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition: avxintrin.h:3319
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition: avxintrin.h:878
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition: avxintrin.h:3375
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:361
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: avxintrin.h:3814
#define __DEFAULT_FN_ATTRS
Definition: avxintrin.h:53
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition: avxintrin.h:3251
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4262
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32], truncating the result b...
Definition: avxintrin.h:2006
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition: avxintrin.h:3401
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition: avxintrin.h:3734
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition: avxintrin.h:1973
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements se...
Definition: avxintrin.h:4046
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4191
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2349
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values.
Definition: avxintrin.h:274
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition: avxintrin.h:3428
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2583
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition: avxintrin.h:2893
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:662
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double],...
Definition: avxintrin.h:4535
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32], truncating the result by rounding toward...
Definition: avxintrin.h:2039
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition: avxintrin.h:4284
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4686
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition: avxintrin.h:124
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values.
Definition: avxintrin.h:3665
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:2815
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values.
Definition: avxintrin.h:236
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2525
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition: avxintrin.h:4741
static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition: avxintrin.h:2088
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition: avxintrin.h:2114
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition: avxintrin.h:180
static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2437
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two diffe...
Definition: avxintrin.h:4819
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition: avxintrin.h:327
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x doub...
Definition: avxintrin.h:143
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:709
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x floa...
Definition: avxintrin.h:162
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2466
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition: avxintrin.h:292
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:548
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition: avxintrin.h:3275
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition: avxintrin.h:3227
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:626
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p in...
Definition: avxintrin.h:2910
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors.
Definition: avxintrin.h:2639
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4325
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: avxintrin.h:3932
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x dou...
Definition: avxintrin.h:4140
static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition: avxintrin.h:2055
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double].
Definition: avxintrin.h:686
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float].
Definition: avxintrin.h:755
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2495
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition: avxintrin.h:1927
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition: avxintrin.h:4665
#define __DEFAULT_FN_ATTRS128
Definition: avxintrin.h:54
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition: avxintrin.h:569
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: avxintrin.h:3766
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition: avxintrin.h:1421
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition: avxintrin.h:2184
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory l...
Definition: avxintrin.h:4848
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition: avxintrin.h:106
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4619
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:608
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2290
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to ze...
Definition: avxintrin.h:4061
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition: avxintrin.h:2161
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values.
Definition: avxintrin.h:3547
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer ve...
Definition: avxintrin.h:4765
static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition: avxintrin.h:785
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition: avxintrin.h:4088
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition: avxintrin.h:4123
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3202
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition: avxintrin.h:198
static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition: avxintrin.h:2071
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition: avxintrin.h:3360
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition: avxintrin.h:4642
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit int...
Definition: avxintrin.h:2966
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to b...
Definition: avxintrin.h:3084
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values.
Definition: avxintrin.h:255
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition: avxintrin.h:4241
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values.
Definition: avxintrin.h:3499
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4305
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] and interleaves them into a 256-bit vector of [8 x float].
Definition: avxintrin.h:2233
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values.
Definition: avxintrin.h:3630
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p into a vector of [8 x float].
Definition: avxintrin.h:2930
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-precision floating-point values.
Definition: avxintrin.h:3467
static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and broadcasts it to the elements of a [4 x float] vector.
Definition: avxintrin.h:2771
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_ps(__m128 __a, __m128 __b)
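Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of the single-precision element in the first source vector and the corresponding element in the second source vector.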
Definition: avxintrin.h:2407
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float] and writes them to the lower order bits of the return value.
Definition: avxintrin.h:2729
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition: avxintrin.h:310
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] and interleaves them into a 256-bit vector of [8 x float].
Definition: avxintrin.h:2260
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector operand.
Definition: avxintrin.h:824
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values.
Definition: avxintrin.h:217
static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_ps(__m128 __a, __m128 __b)
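Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of the single-precision element in the first source vector and the corresponding element in the second source vector.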
Definition: avxintrin.h:2378
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition: avxintrin.h:4007
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i_u const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit integer vector.
Definition: avxintrin.h:2987
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:530
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligned memory location pointed to by __p.
Definition: avxintrin.h:3025
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two different unaligned memory locations.
Definition: avxintrin.h:4790
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition: avxintrin.h:3989
typedef double __v4df __attribute__((__vector_size__(32)))
Definition: avxintrin.h:17
static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_pd(__m128d __a, __m128d __b)
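Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of the double-precision element in the first source vector and the corresponding element in the second source vector.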
Definition: avxintrin.h:2319
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b)
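Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of the single-precision elements in the first source vector and the corresponding elements in the second source vector.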
Definition: avxintrin.h:2554
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 x float].
Definition: avxintrin.h:4343
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition: avxintrin.h:70
static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory location pointed to by __p, according to the specified mask.
Definition: avxintrin.h:3299
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by __p into a vector of [4 x double].
Definition: avxintrin.h:2877
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the values contained in the first source operand.
Definition: avxintrin.h:590
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the specified double-precision floating-point values.
Definition: avxintrin.h:3694
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1573
static __inline__ void int __a
Definition: emmintrin.h:3986
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit integer vector.
Definition: emmintrin.h:3369
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1821
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1934
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3787
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3818
struct __storeu_i16 *__P __v
Definition: immintrin.h:504
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:1907
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:1987
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1744